vllm_cpu_avx512vnni-0.13.0-cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vllm-cpu-avx512vnni might be problematic. See the advisory details below for more information.

Files changed (1641)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +1260 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +3080 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +443 -0
  16. vllm/attention/backends/registry.py +254 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +969 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +120 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/layers/mm_encoder_attention.py +284 -0
  24. vllm/attention/ops/__init__.py +0 -0
  25. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  26. vllm/attention/ops/common.py +469 -0
  27. vllm/attention/ops/flashmla.py +251 -0
  28. vllm/attention/ops/merge_attn_states.py +47 -0
  29. vllm/attention/ops/paged_attn.py +51 -0
  30. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  31. vllm/attention/ops/prefix_prefill.py +814 -0
  32. vllm/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  33. vllm/attention/ops/triton_decode_attention.py +712 -0
  34. vllm/attention/ops/triton_merge_attn_states.py +116 -0
  35. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  36. vllm/attention/ops/triton_unified_attention.py +1047 -0
  37. vllm/attention/ops/vit_attn_wrappers.py +139 -0
  38. vllm/attention/selector.py +145 -0
  39. vllm/attention/utils/__init__.py +0 -0
  40. vllm/attention/utils/fa_utils.py +118 -0
  41. vllm/attention/utils/kv_sharing_utils.py +33 -0
  42. vllm/attention/utils/kv_transfer_utils.py +60 -0
  43. vllm/beam_search.py +88 -0
  44. vllm/benchmarks/__init__.py +0 -0
  45. vllm/benchmarks/datasets.py +3228 -0
  46. vllm/benchmarks/latency.py +170 -0
  47. vllm/benchmarks/lib/__init__.py +3 -0
  48. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  49. vllm/benchmarks/lib/ready_checker.py +72 -0
  50. vllm/benchmarks/lib/utils.py +79 -0
  51. vllm/benchmarks/serve.py +1538 -0
  52. vllm/benchmarks/startup.py +326 -0
  53. vllm/benchmarks/sweep/__init__.py +0 -0
  54. vllm/benchmarks/sweep/cli.py +41 -0
  55. vllm/benchmarks/sweep/param_sweep.py +158 -0
  56. vllm/benchmarks/sweep/plot.py +675 -0
  57. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  58. vllm/benchmarks/sweep/serve.py +450 -0
  59. vllm/benchmarks/sweep/serve_sla.py +492 -0
  60. vllm/benchmarks/sweep/server.py +114 -0
  61. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  62. vllm/benchmarks/sweep/utils.py +4 -0
  63. vllm/benchmarks/throughput.py +808 -0
  64. vllm/collect_env.py +857 -0
  65. vllm/compilation/__init__.py +0 -0
  66. vllm/compilation/activation_quant_fusion.py +209 -0
  67. vllm/compilation/backends.py +839 -0
  68. vllm/compilation/base_static_graph.py +57 -0
  69. vllm/compilation/caching.py +180 -0
  70. vllm/compilation/collective_fusion.py +1215 -0
  71. vllm/compilation/compiler_interface.py +639 -0
  72. vllm/compilation/counter.py +48 -0
  73. vllm/compilation/cuda_graph.py +302 -0
  74. vllm/compilation/decorators.py +626 -0
  75. vllm/compilation/fix_functionalization.py +266 -0
  76. vllm/compilation/fusion.py +550 -0
  77. vllm/compilation/fusion_attn.py +359 -0
  78. vllm/compilation/fx_utils.py +91 -0
  79. vllm/compilation/inductor_pass.py +138 -0
  80. vllm/compilation/matcher_utils.py +361 -0
  81. vllm/compilation/monitor.py +62 -0
  82. vllm/compilation/noop_elimination.py +130 -0
  83. vllm/compilation/partition_rules.py +72 -0
  84. vllm/compilation/pass_manager.py +155 -0
  85. vllm/compilation/piecewise_backend.py +178 -0
  86. vllm/compilation/post_cleanup.py +21 -0
  87. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  88. vllm/compilation/rocm_aiter_fusion.py +242 -0
  89. vllm/compilation/sequence_parallelism.py +364 -0
  90. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  91. vllm/compilation/vllm_inductor_pass.py +173 -0
  92. vllm/compilation/wrapper.py +319 -0
  93. vllm/config/__init__.py +108 -0
  94. vllm/config/attention.py +114 -0
  95. vllm/config/cache.py +232 -0
  96. vllm/config/compilation.py +1140 -0
  97. vllm/config/device.py +75 -0
  98. vllm/config/ec_transfer.py +110 -0
  99. vllm/config/kv_events.py +56 -0
  100. vllm/config/kv_transfer.py +119 -0
  101. vllm/config/load.py +124 -0
  102. vllm/config/lora.py +96 -0
  103. vllm/config/model.py +2190 -0
  104. vllm/config/multimodal.py +247 -0
  105. vllm/config/observability.py +140 -0
  106. vllm/config/parallel.py +660 -0
  107. vllm/config/pooler.py +126 -0
  108. vllm/config/profiler.py +199 -0
  109. vllm/config/scheduler.py +299 -0
  110. vllm/config/speculative.py +644 -0
  111. vllm/config/speech_to_text.py +38 -0
  112. vllm/config/structured_outputs.py +78 -0
  113. vllm/config/utils.py +370 -0
  114. vllm/config/vllm.py +1434 -0
  115. vllm/connections.py +189 -0
  116. vllm/device_allocator/__init__.py +0 -0
  117. vllm/device_allocator/cumem.py +327 -0
  118. vllm/distributed/__init__.py +6 -0
  119. vllm/distributed/communication_op.py +43 -0
  120. vllm/distributed/device_communicators/__init__.py +0 -0
  121. vllm/distributed/device_communicators/all2all.py +490 -0
  122. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  123. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  124. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  125. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  126. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  127. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  128. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  129. vllm/distributed/device_communicators/pynccl.py +386 -0
  130. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  131. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  132. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  133. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  134. vllm/distributed/device_communicators/shm_broadcast.py +778 -0
  135. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  136. vllm/distributed/device_communicators/symm_mem.py +156 -0
  137. vllm/distributed/device_communicators/tpu_communicator.py +99 -0
  138. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  139. vllm/distributed/ec_transfer/__init__.py +14 -0
  140. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  141. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  142. vllm/distributed/ec_transfer/ec_connector/example_connector.py +201 -0
  143. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  144. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  145. vllm/distributed/eplb/__init__.py +3 -0
  146. vllm/distributed/eplb/async_worker.py +115 -0
  147. vllm/distributed/eplb/eplb_state.py +1164 -0
  148. vllm/distributed/eplb/policy/__init__.py +19 -0
  149. vllm/distributed/eplb/policy/abstract.py +40 -0
  150. vllm/distributed/eplb/policy/default.py +267 -0
  151. vllm/distributed/eplb/rebalance_execute.py +529 -0
  152. vllm/distributed/kv_events.py +499 -0
  153. vllm/distributed/kv_transfer/README.md +29 -0
  154. vllm/distributed/kv_transfer/__init__.py +20 -0
  155. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  156. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  157. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  158. vllm/distributed/kv_transfer/kv_connector/factory.py +197 -0
  159. vllm/distributed/kv_transfer/kv_connector/utils.py +322 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/base.py +597 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +450 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +327 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +378 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1418 -0
  169. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +895 -0
  170. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +186 -0
  171. vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +914 -0
  172. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +464 -0
  173. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2526 -0
  174. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +538 -0
  175. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  176. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  177. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  178. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  179. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  180. vllm/distributed/parallel_state.py +1795 -0
  181. vllm/distributed/tpu_distributed_utils.py +188 -0
  182. vllm/distributed/utils.py +545 -0
  183. vllm/engine/__init__.py +0 -0
  184. vllm/engine/arg_utils.py +2068 -0
  185. vllm/engine/async_llm_engine.py +6 -0
  186. vllm/engine/llm_engine.py +6 -0
  187. vllm/engine/protocol.py +190 -0
  188. vllm/entrypoints/__init__.py +0 -0
  189. vllm/entrypoints/anthropic/__init__.py +0 -0
  190. vllm/entrypoints/anthropic/protocol.py +162 -0
  191. vllm/entrypoints/anthropic/serving_messages.py +468 -0
  192. vllm/entrypoints/api_server.py +185 -0
  193. vllm/entrypoints/chat_utils.py +1903 -0
  194. vllm/entrypoints/cli/__init__.py +15 -0
  195. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  196. vllm/entrypoints/cli/benchmark/base.py +25 -0
  197. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  198. vllm/entrypoints/cli/benchmark/main.py +56 -0
  199. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  200. vllm/entrypoints/cli/benchmark/startup.py +21 -0
  201. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  202. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  203. vllm/entrypoints/cli/collect_env.py +38 -0
  204. vllm/entrypoints/cli/main.py +79 -0
  205. vllm/entrypoints/cli/openai.py +260 -0
  206. vllm/entrypoints/cli/run_batch.py +68 -0
  207. vllm/entrypoints/cli/serve.py +249 -0
  208. vllm/entrypoints/cli/types.py +29 -0
  209. vllm/entrypoints/constants.py +12 -0
  210. vllm/entrypoints/context.py +835 -0
  211. vllm/entrypoints/launcher.py +175 -0
  212. vllm/entrypoints/llm.py +1790 -0
  213. vllm/entrypoints/logger.py +84 -0
  214. vllm/entrypoints/openai/__init__.py +0 -0
  215. vllm/entrypoints/openai/api_server.py +1469 -0
  216. vllm/entrypoints/openai/cli_args.py +302 -0
  217. vllm/entrypoints/openai/orca_metrics.py +120 -0
  218. vllm/entrypoints/openai/parser/__init__.py +0 -0
  219. vllm/entrypoints/openai/parser/harmony_utils.py +825 -0
  220. vllm/entrypoints/openai/parser/responses_parser.py +135 -0
  221. vllm/entrypoints/openai/protocol.py +2496 -0
  222. vllm/entrypoints/openai/run_batch.py +631 -0
  223. vllm/entrypoints/openai/serving_chat.py +1822 -0
  224. vllm/entrypoints/openai/serving_completion.py +729 -0
  225. vllm/entrypoints/openai/serving_engine.py +1542 -0
  226. vllm/entrypoints/openai/serving_models.py +304 -0
  227. vllm/entrypoints/openai/serving_responses.py +2080 -0
  228. vllm/entrypoints/openai/serving_transcription.py +168 -0
  229. vllm/entrypoints/openai/speech_to_text.py +559 -0
  230. vllm/entrypoints/openai/tool_parsers/__init__.py +33 -0
  231. vllm/entrypoints/openai/utils.py +49 -0
  232. vllm/entrypoints/pooling/__init__.py +16 -0
  233. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  234. vllm/entrypoints/pooling/classify/api_router.py +50 -0
  235. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  236. vllm/entrypoints/pooling/classify/serving.py +233 -0
  237. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  238. vllm/entrypoints/pooling/embed/api_router.py +67 -0
  239. vllm/entrypoints/pooling/embed/protocol.py +208 -0
  240. vllm/entrypoints/pooling/embed/serving.py +684 -0
  241. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  242. vllm/entrypoints/pooling/pooling/api_router.py +63 -0
  243. vllm/entrypoints/pooling/pooling/protocol.py +148 -0
  244. vllm/entrypoints/pooling/pooling/serving.py +354 -0
  245. vllm/entrypoints/pooling/score/__init__.py +0 -0
  246. vllm/entrypoints/pooling/score/api_router.py +149 -0
  247. vllm/entrypoints/pooling/score/protocol.py +146 -0
  248. vllm/entrypoints/pooling/score/serving.py +508 -0
  249. vllm/entrypoints/renderer.py +410 -0
  250. vllm/entrypoints/responses_utils.py +249 -0
  251. vllm/entrypoints/sagemaker/__init__.py +4 -0
  252. vllm/entrypoints/sagemaker/routes.py +118 -0
  253. vllm/entrypoints/score_utils.py +237 -0
  254. vllm/entrypoints/serve/__init__.py +60 -0
  255. vllm/entrypoints/serve/disagg/__init__.py +0 -0
  256. vllm/entrypoints/serve/disagg/api_router.py +110 -0
  257. vllm/entrypoints/serve/disagg/protocol.py +90 -0
  258. vllm/entrypoints/serve/disagg/serving.py +285 -0
  259. vllm/entrypoints/serve/elastic_ep/__init__.py +0 -0
  260. vllm/entrypoints/serve/elastic_ep/api_router.py +96 -0
  261. vllm/entrypoints/serve/elastic_ep/middleware.py +49 -0
  262. vllm/entrypoints/serve/instrumentator/__init__.py +0 -0
  263. vllm/entrypoints/serve/instrumentator/health.py +33 -0
  264. vllm/entrypoints/serve/instrumentator/metrics.py +45 -0
  265. vllm/entrypoints/serve/lora/__init__.py +0 -0
  266. vllm/entrypoints/serve/lora/api_router.py +70 -0
  267. vllm/entrypoints/serve/profile/__init__.py +0 -0
  268. vllm/entrypoints/serve/profile/api_router.py +46 -0
  269. vllm/entrypoints/serve/rlhf/__init__.py +0 -0
  270. vllm/entrypoints/serve/rlhf/api_router.py +102 -0
  271. vllm/entrypoints/serve/sleep/__init__.py +0 -0
  272. vllm/entrypoints/serve/sleep/api_router.py +60 -0
  273. vllm/entrypoints/serve/tokenize/__init__.py +0 -0
  274. vllm/entrypoints/serve/tokenize/api_router.py +118 -0
  275. vllm/entrypoints/serve/tokenize/serving.py +204 -0
  276. vllm/entrypoints/ssl.py +78 -0
  277. vllm/entrypoints/tool.py +187 -0
  278. vllm/entrypoints/tool_server.py +234 -0
  279. vllm/entrypoints/utils.py +319 -0
  280. vllm/env_override.py +378 -0
  281. vllm/envs.py +1744 -0
  282. vllm/forward_context.py +358 -0
  283. vllm/inputs/__init__.py +44 -0
  284. vllm/inputs/data.py +359 -0
  285. vllm/inputs/parse.py +146 -0
  286. vllm/inputs/preprocess.py +717 -0
  287. vllm/logger.py +303 -0
  288. vllm/logging_utils/__init__.py +13 -0
  289. vllm/logging_utils/dump_input.py +83 -0
  290. vllm/logging_utils/formatter.py +127 -0
  291. vllm/logging_utils/lazy.py +20 -0
  292. vllm/logging_utils/log_time.py +34 -0
  293. vllm/logits_process.py +121 -0
  294. vllm/logprobs.py +206 -0
  295. vllm/lora/__init__.py +0 -0
  296. vllm/lora/layers/__init__.py +42 -0
  297. vllm/lora/layers/base.py +66 -0
  298. vllm/lora/layers/base_linear.py +165 -0
  299. vllm/lora/layers/column_parallel_linear.py +577 -0
  300. vllm/lora/layers/fused_moe.py +747 -0
  301. vllm/lora/layers/logits_processor.py +203 -0
  302. vllm/lora/layers/replicated_linear.py +70 -0
  303. vllm/lora/layers/row_parallel_linear.py +176 -0
  304. vllm/lora/layers/utils.py +74 -0
  305. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  306. vllm/lora/lora_model.py +246 -0
  307. vllm/lora/lora_weights.py +227 -0
  308. vllm/lora/model_manager.py +690 -0
  309. vllm/lora/ops/__init__.py +0 -0
  310. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  311. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  312. vllm/lora/ops/torch_ops/__init__.py +20 -0
  313. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  314. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  315. vllm/lora/ops/triton_ops/__init__.py +21 -0
  316. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +665 -0
  317. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  318. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  319. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  320. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  321. vllm/lora/ops/triton_ops/utils.py +295 -0
  322. vllm/lora/ops/xla_ops/__init__.py +6 -0
  323. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  324. vllm/lora/peft_helper.py +128 -0
  325. vllm/lora/punica_wrapper/__init__.py +10 -0
  326. vllm/lora/punica_wrapper/punica_base.py +493 -0
  327. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  328. vllm/lora/punica_wrapper/punica_gpu.py +412 -0
  329. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  330. vllm/lora/punica_wrapper/punica_tpu.py +358 -0
  331. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  332. vllm/lora/punica_wrapper/utils.py +150 -0
  333. vllm/lora/request.py +100 -0
  334. vllm/lora/resolver.py +88 -0
  335. vllm/lora/utils.py +315 -0
  336. vllm/lora/worker_manager.py +268 -0
  337. vllm/model_executor/__init__.py +11 -0
  338. vllm/model_executor/custom_op.py +199 -0
  339. vllm/model_executor/layers/__init__.py +0 -0
  340. vllm/model_executor/layers/activation.py +595 -0
  341. vllm/model_executor/layers/attention_layer_base.py +32 -0
  342. vllm/model_executor/layers/batch_invariant.py +1067 -0
  343. vllm/model_executor/layers/conv.py +256 -0
  344. vllm/model_executor/layers/fla/__init__.py +8 -0
  345. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  346. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  347. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  348. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  349. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  350. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  351. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  352. vllm/model_executor/layers/fla/ops/index.py +41 -0
  353. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  354. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  355. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  356. vllm/model_executor/layers/fla/ops/op.py +60 -0
  357. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  358. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  359. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  360. vllm/model_executor/layers/fused_moe/__init__.py +114 -0
  361. vllm/model_executor/layers/fused_moe/all2all_utils.py +171 -0
  362. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +409 -0
  363. vllm/model_executor/layers/fused_moe/config.py +1043 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  638. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  639. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +292 -0
  640. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1453 -0
  641. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +358 -0
  642. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +427 -0
  643. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  644. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +434 -0
  645. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +376 -0
  646. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  647. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  648. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  649. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  650. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +825 -0
  651. vllm/model_executor/layers/fused_moe/fused_moe.py +2223 -0
  652. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +103 -0
  653. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +119 -0
  654. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +524 -0
  655. vllm/model_executor/layers/fused_moe/layer.py +2133 -0
  656. vllm/model_executor/layers/fused_moe/modular_kernel.py +1302 -0
  657. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +192 -0
  658. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  659. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  660. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  661. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  662. vllm/model_executor/layers/fused_moe/prepare_finalize.py +78 -0
  663. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  664. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  665. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  666. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  667. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  668. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  669. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +455 -0
  670. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  671. vllm/model_executor/layers/kda.py +442 -0
  672. vllm/model_executor/layers/layernorm.py +442 -0
  673. vllm/model_executor/layers/lightning_attn.py +735 -0
  674. vllm/model_executor/layers/linear.py +1424 -0
  675. vllm/model_executor/layers/logits_processor.py +106 -0
  676. vllm/model_executor/layers/mamba/__init__.py +0 -0
  677. vllm/model_executor/layers/mamba/abstract.py +68 -0
  678. vllm/model_executor/layers/mamba/linear_attn.py +388 -0
  679. vllm/model_executor/layers/mamba/mamba_mixer.py +526 -0
  680. vllm/model_executor/layers/mamba/mamba_mixer2.py +930 -0
  681. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  682. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  683. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  684. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  685. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +586 -0
  686. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  687. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  688. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  689. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  690. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  691. vllm/model_executor/layers/mamba/short_conv.py +255 -0
  692. vllm/model_executor/layers/mla.py +176 -0
  693. vllm/model_executor/layers/pooler.py +830 -0
  694. vllm/model_executor/layers/quantization/__init__.py +179 -0
  695. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  696. vllm/model_executor/layers/quantization/awq.py +277 -0
  697. vllm/model_executor/layers/quantization/awq_marlin.py +793 -0
  698. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  699. vllm/model_executor/layers/quantization/base_config.py +170 -0
  700. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  701. vllm/model_executor/layers/quantization/bitsandbytes.py +626 -0
  702. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  703. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +986 -0
  704. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2645 -0
  705. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  706. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  707. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  710. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  711. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +176 -0
  712. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  713. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  714. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  715. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  716. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  717. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  718. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  719. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  720. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  721. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  722. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  723. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  724. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  725. vllm/model_executor/layers/quantization/cpu_wna16.py +625 -0
  726. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  727. vllm/model_executor/layers/quantization/experts_int8.py +207 -0
  728. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  729. vllm/model_executor/layers/quantization/fp8.py +1461 -0
  730. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  731. vllm/model_executor/layers/quantization/gguf.py +677 -0
  732. vllm/model_executor/layers/quantization/gptq.py +393 -0
  733. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  734. vllm/model_executor/layers/quantization/gptq_marlin.py +932 -0
  735. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  736. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  737. vllm/model_executor/layers/quantization/inc.py +65 -0
  738. vllm/model_executor/layers/quantization/input_quant_fp8.py +202 -0
  739. vllm/model_executor/layers/quantization/ipex_quant.py +487 -0
  740. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  741. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  742. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +109 -0
  743. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  744. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  745. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  746. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +130 -0
  747. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  748. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  749. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  750. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  751. vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py +97 -0
  752. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +76 -0
  753. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +81 -0
  754. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +128 -0
  755. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +220 -0
  756. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +147 -0
  757. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +71 -0
  758. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +106 -0
  759. vllm/model_executor/layers/quantization/kv_cache.py +153 -0
  760. vllm/model_executor/layers/quantization/modelopt.py +1684 -0
  761. vllm/model_executor/layers/quantization/moe_wna16.py +516 -0
  762. vllm/model_executor/layers/quantization/mxfp4.py +1140 -0
  763. vllm/model_executor/layers/quantization/petit.py +319 -0
  764. vllm/model_executor/layers/quantization/ptpc_fp8.py +136 -0
  765. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  766. vllm/model_executor/layers/quantization/quark/quark.py +527 -0
  767. vllm/model_executor/layers/quantization/quark/quark_moe.py +622 -0
  768. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  769. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  770. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  771. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  772. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  773. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  774. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  775. vllm/model_executor/layers/quantization/rtn.py +621 -0
  776. vllm/model_executor/layers/quantization/schema.py +90 -0
  777. vllm/model_executor/layers/quantization/torchao.py +380 -0
  778. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  779. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  780. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  781. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  996. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  997. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +412 -0
  998. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +312 -0
  999. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1453 -0
  1000. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1001. vllm/model_executor/layers/quantization/utils/int8_utils.py +474 -0
  1002. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1003. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1004. vllm/model_executor/layers/quantization/utils/marlin_utils.py +678 -0
  1005. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +452 -0
  1006. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +381 -0
  1007. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1008. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1009. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +189 -0
  1010. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1011. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1012. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1013. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1014. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1015. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1016. vllm/model_executor/layers/quantization/utils/quant_utils.py +741 -0
  1017. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +519 -0
  1018. vllm/model_executor/layers/resampler.py +283 -0
  1019. vllm/model_executor/layers/rotary_embedding/__init__.py +289 -0
  1020. vllm/model_executor/layers/rotary_embedding/base.py +254 -0
  1021. vllm/model_executor/layers/rotary_embedding/common.py +279 -0
  1022. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1023. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1024. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1025. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1026. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +82 -0
  1027. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1028. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1029. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1030. vllm/model_executor/layers/rotary_embedding/mrope.py +412 -0
  1031. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1032. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1033. vllm/model_executor/layers/rotary_embedding/xdrope.py +160 -0
  1034. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1035. vllm/model_executor/layers/utils.py +251 -0
  1036. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1037. vllm/model_executor/model_loader/__init__.py +150 -0
  1038. vllm/model_executor/model_loader/base_loader.py +57 -0
  1039. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1040. vllm/model_executor/model_loader/default_loader.py +321 -0
  1041. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1042. vllm/model_executor/model_loader/gguf_loader.py +371 -0
  1043. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1044. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1045. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1046. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1047. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1048. vllm/model_executor/model_loader/tpu.py +118 -0
  1049. vllm/model_executor/model_loader/utils.py +292 -0
  1050. vllm/model_executor/model_loader/weight_utils.py +1157 -0
  1051. vllm/model_executor/models/__init__.py +44 -0
  1052. vllm/model_executor/models/adapters.py +522 -0
  1053. vllm/model_executor/models/afmoe.py +696 -0
  1054. vllm/model_executor/models/aimv2.py +248 -0
  1055. vllm/model_executor/models/apertus.py +565 -0
  1056. vllm/model_executor/models/arcee.py +428 -0
  1057. vllm/model_executor/models/arctic.py +633 -0
  1058. vllm/model_executor/models/aria.py +653 -0
  1059. vllm/model_executor/models/audioflamingo3.py +639 -0
  1060. vllm/model_executor/models/aya_vision.py +448 -0
  1061. vllm/model_executor/models/bagel.py +584 -0
  1062. vllm/model_executor/models/baichuan.py +493 -0
  1063. vllm/model_executor/models/bailing_moe.py +642 -0
  1064. vllm/model_executor/models/bamba.py +511 -0
  1065. vllm/model_executor/models/bee.py +157 -0
  1066. vllm/model_executor/models/bert.py +925 -0
  1067. vllm/model_executor/models/bert_with_rope.py +732 -0
  1068. vllm/model_executor/models/blip.py +350 -0
  1069. vllm/model_executor/models/blip2.py +693 -0
  1070. vllm/model_executor/models/bloom.py +390 -0
  1071. vllm/model_executor/models/chameleon.py +1095 -0
  1072. vllm/model_executor/models/chatglm.py +502 -0
  1073. vllm/model_executor/models/clip.py +1004 -0
  1074. vllm/model_executor/models/cohere2_vision.py +470 -0
  1075. vllm/model_executor/models/commandr.py +469 -0
  1076. vllm/model_executor/models/config.py +531 -0
  1077. vllm/model_executor/models/dbrx.py +484 -0
  1078. vllm/model_executor/models/deepencoder.py +676 -0
  1079. vllm/model_executor/models/deepseek_eagle.py +252 -0
  1080. vllm/model_executor/models/deepseek_mtp.py +446 -0
  1081. vllm/model_executor/models/deepseek_ocr.py +591 -0
  1082. vllm/model_executor/models/deepseek_v2.py +1710 -0
  1083. vllm/model_executor/models/deepseek_vl2.py +642 -0
  1084. vllm/model_executor/models/dots1.py +565 -0
  1085. vllm/model_executor/models/dots_ocr.py +821 -0
  1086. vllm/model_executor/models/ernie45.py +53 -0
  1087. vllm/model_executor/models/ernie45_moe.py +754 -0
  1088. vllm/model_executor/models/ernie45_vl.py +1621 -0
  1089. vllm/model_executor/models/ernie45_vl_moe.py +800 -0
  1090. vllm/model_executor/models/ernie_mtp.py +279 -0
  1091. vllm/model_executor/models/exaone.py +524 -0
  1092. vllm/model_executor/models/exaone4.py +516 -0
  1093. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1094. vllm/model_executor/models/falcon.py +543 -0
  1095. vllm/model_executor/models/falcon_h1.py +675 -0
  1096. vllm/model_executor/models/flex_olmo.py +155 -0
  1097. vllm/model_executor/models/fuyu.py +371 -0
  1098. vllm/model_executor/models/gemma.py +425 -0
  1099. vllm/model_executor/models/gemma2.py +435 -0
  1100. vllm/model_executor/models/gemma3.py +507 -0
  1101. vllm/model_executor/models/gemma3_mm.py +664 -0
  1102. vllm/model_executor/models/gemma3n.py +1166 -0
  1103. vllm/model_executor/models/gemma3n_mm.py +810 -0
  1104. vllm/model_executor/models/glm.py +24 -0
  1105. vllm/model_executor/models/glm4.py +295 -0
  1106. vllm/model_executor/models/glm4_1v.py +1808 -0
  1107. vllm/model_executor/models/glm4_moe.py +736 -0
  1108. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1109. vllm/model_executor/models/glm4v.py +783 -0
  1110. vllm/model_executor/models/gpt2.py +397 -0
  1111. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1112. vllm/model_executor/models/gpt_j.py +346 -0
  1113. vllm/model_executor/models/gpt_neox.py +340 -0
  1114. vllm/model_executor/models/gpt_oss.py +744 -0
  1115. vllm/model_executor/models/granite.py +475 -0
  1116. vllm/model_executor/models/granite_speech.py +912 -0
  1117. vllm/model_executor/models/granitemoe.py +560 -0
  1118. vllm/model_executor/models/granitemoehybrid.py +703 -0
  1119. vllm/model_executor/models/granitemoeshared.py +328 -0
  1120. vllm/model_executor/models/gritlm.py +243 -0
  1121. vllm/model_executor/models/grok1.py +554 -0
  1122. vllm/model_executor/models/h2ovl.py +554 -0
  1123. vllm/model_executor/models/hunyuan_v1.py +1040 -0
  1124. vllm/model_executor/models/hunyuan_vision.py +1034 -0
  1125. vllm/model_executor/models/hyperclovax_vision.py +1164 -0
  1126. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1127. vllm/model_executor/models/idefics3.py +716 -0
  1128. vllm/model_executor/models/interfaces.py +1179 -0
  1129. vllm/model_executor/models/interfaces_base.py +228 -0
  1130. vllm/model_executor/models/intern_vit.py +454 -0
  1131. vllm/model_executor/models/internlm2.py +453 -0
  1132. vllm/model_executor/models/internlm2_ve.py +139 -0
  1133. vllm/model_executor/models/interns1.py +828 -0
  1134. vllm/model_executor/models/interns1_vit.py +433 -0
  1135. vllm/model_executor/models/internvl.py +1450 -0
  1136. vllm/model_executor/models/jais.py +397 -0
  1137. vllm/model_executor/models/jais2.py +529 -0
  1138. vllm/model_executor/models/jamba.py +609 -0
  1139. vllm/model_executor/models/jina_vl.py +147 -0
  1140. vllm/model_executor/models/keye.py +1706 -0
  1141. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1142. vllm/model_executor/models/kimi_linear.py +658 -0
  1143. vllm/model_executor/models/kimi_vl.py +576 -0
  1144. vllm/model_executor/models/lfm2.py +515 -0
  1145. vllm/model_executor/models/lfm2_moe.py +745 -0
  1146. vllm/model_executor/models/lightonocr.py +195 -0
  1147. vllm/model_executor/models/llama.py +700 -0
  1148. vllm/model_executor/models/llama4.py +856 -0
  1149. vllm/model_executor/models/llama4_eagle.py +225 -0
  1150. vllm/model_executor/models/llama_eagle.py +213 -0
  1151. vllm/model_executor/models/llama_eagle3.py +375 -0
  1152. vllm/model_executor/models/llava.py +840 -0
  1153. vllm/model_executor/models/llava_next.py +581 -0
  1154. vllm/model_executor/models/llava_next_video.py +465 -0
  1155. vllm/model_executor/models/llava_onevision.py +921 -0
  1156. vllm/model_executor/models/longcat_flash.py +743 -0
  1157. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1158. vllm/model_executor/models/mamba.py +276 -0
  1159. vllm/model_executor/models/mamba2.py +288 -0
  1160. vllm/model_executor/models/medusa.py +179 -0
  1161. vllm/model_executor/models/midashenglm.py +826 -0
  1162. vllm/model_executor/models/mimo.py +188 -0
  1163. vllm/model_executor/models/mimo_mtp.py +294 -0
  1164. vllm/model_executor/models/minicpm.py +656 -0
  1165. vllm/model_executor/models/minicpm3.py +233 -0
  1166. vllm/model_executor/models/minicpm_eagle.py +385 -0
  1167. vllm/model_executor/models/minicpmo.py +768 -0
  1168. vllm/model_executor/models/minicpmv.py +1742 -0
  1169. vllm/model_executor/models/minimax_m2.py +550 -0
  1170. vllm/model_executor/models/minimax_text_01.py +1007 -0
  1171. vllm/model_executor/models/minimax_vl_01.py +394 -0
  1172. vllm/model_executor/models/mistral3.py +635 -0
  1173. vllm/model_executor/models/mistral_large_3.py +63 -0
  1174. vllm/model_executor/models/mistral_large_3_eagle.py +136 -0
  1175. vllm/model_executor/models/mixtral.py +598 -0
  1176. vllm/model_executor/models/mllama4.py +1149 -0
  1177. vllm/model_executor/models/mlp_speculator.py +235 -0
  1178. vllm/model_executor/models/modernbert.py +451 -0
  1179. vllm/model_executor/models/module_mapping.py +74 -0
  1180. vllm/model_executor/models/molmo.py +1550 -0
  1181. vllm/model_executor/models/moonvit.py +686 -0
  1182. vllm/model_executor/models/mpt.py +335 -0
  1183. vllm/model_executor/models/nano_nemotron_vl.py +1730 -0
  1184. vllm/model_executor/models/nemotron.py +499 -0
  1185. vllm/model_executor/models/nemotron_h.py +900 -0
  1186. vllm/model_executor/models/nemotron_nas.py +471 -0
  1187. vllm/model_executor/models/nemotron_vl.py +651 -0
  1188. vllm/model_executor/models/nvlm_d.py +216 -0
  1189. vllm/model_executor/models/olmo.py +412 -0
  1190. vllm/model_executor/models/olmo2.py +454 -0
  1191. vllm/model_executor/models/olmoe.py +493 -0
  1192. vllm/model_executor/models/opencua.py +262 -0
  1193. vllm/model_executor/models/openpangu.py +1049 -0
  1194. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1195. vllm/model_executor/models/opt.py +426 -0
  1196. vllm/model_executor/models/orion.py +365 -0
  1197. vllm/model_executor/models/ouro.py +507 -0
  1198. vllm/model_executor/models/ovis.py +557 -0
  1199. vllm/model_executor/models/ovis2_5.py +661 -0
  1200. vllm/model_executor/models/paddleocr_vl.py +1300 -0
  1201. vllm/model_executor/models/paligemma.py +408 -0
  1202. vllm/model_executor/models/persimmon.py +373 -0
  1203. vllm/model_executor/models/phi.py +363 -0
  1204. vllm/model_executor/models/phi3.py +18 -0
  1205. vllm/model_executor/models/phi3v.py +729 -0
  1206. vllm/model_executor/models/phi4mm.py +1251 -0
  1207. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1208. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1209. vllm/model_executor/models/phimoe.py +669 -0
  1210. vllm/model_executor/models/pixtral.py +1379 -0
  1211. vllm/model_executor/models/plamo2.py +965 -0
  1212. vllm/model_executor/models/plamo3.py +440 -0
  1213. vllm/model_executor/models/qwen.py +365 -0
  1214. vllm/model_executor/models/qwen2.py +600 -0
  1215. vllm/model_executor/models/qwen2_5_omni_thinker.py +1219 -0
  1216. vllm/model_executor/models/qwen2_5_vl.py +1569 -0
  1217. vllm/model_executor/models/qwen2_audio.py +471 -0
  1218. vllm/model_executor/models/qwen2_moe.py +597 -0
  1219. vllm/model_executor/models/qwen2_rm.py +123 -0
  1220. vllm/model_executor/models/qwen2_vl.py +1568 -0
  1221. vllm/model_executor/models/qwen3.py +331 -0
  1222. vllm/model_executor/models/qwen3_moe.py +751 -0
  1223. vllm/model_executor/models/qwen3_next.py +1395 -0
  1224. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1225. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1793 -0
  1226. vllm/model_executor/models/qwen3_vl.py +2092 -0
  1227. vllm/model_executor/models/qwen3_vl_moe.py +474 -0
  1228. vllm/model_executor/models/qwen_vl.py +801 -0
  1229. vllm/model_executor/models/radio.py +555 -0
  1230. vllm/model_executor/models/registry.py +1189 -0
  1231. vllm/model_executor/models/roberta.py +259 -0
  1232. vllm/model_executor/models/rvl.py +107 -0
  1233. vllm/model_executor/models/seed_oss.py +492 -0
  1234. vllm/model_executor/models/siglip.py +1244 -0
  1235. vllm/model_executor/models/siglip2navit.py +658 -0
  1236. vllm/model_executor/models/skyworkr1v.py +951 -0
  1237. vllm/model_executor/models/smolvlm.py +38 -0
  1238. vllm/model_executor/models/solar.py +484 -0
  1239. vllm/model_executor/models/stablelm.py +354 -0
  1240. vllm/model_executor/models/starcoder2.py +365 -0
  1241. vllm/model_executor/models/step3_text.py +554 -0
  1242. vllm/model_executor/models/step3_vl.py +1147 -0
  1243. vllm/model_executor/models/swin.py +514 -0
  1244. vllm/model_executor/models/tarsier.py +617 -0
  1245. vllm/model_executor/models/telechat2.py +153 -0
  1246. vllm/model_executor/models/teleflm.py +78 -0
  1247. vllm/model_executor/models/terratorch.py +318 -0
  1248. vllm/model_executor/models/transformers/__init__.py +127 -0
  1249. vllm/model_executor/models/transformers/base.py +518 -0
  1250. vllm/model_executor/models/transformers/causal.py +65 -0
  1251. vllm/model_executor/models/transformers/legacy.py +90 -0
  1252. vllm/model_executor/models/transformers/moe.py +325 -0
  1253. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1254. vllm/model_executor/models/transformers/pooling.py +119 -0
  1255. vllm/model_executor/models/transformers/utils.py +213 -0
  1256. vllm/model_executor/models/ultravox.py +766 -0
  1257. vllm/model_executor/models/utils.py +832 -0
  1258. vllm/model_executor/models/vision.py +546 -0
  1259. vllm/model_executor/models/voxtral.py +841 -0
  1260. vllm/model_executor/models/whisper.py +971 -0
  1261. vllm/model_executor/models/zamba2.py +979 -0
  1262. vllm/model_executor/parameter.py +642 -0
  1263. vllm/model_executor/utils.py +119 -0
  1264. vllm/model_executor/warmup/__init__.py +0 -0
  1265. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1266. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1267. vllm/multimodal/__init__.py +40 -0
  1268. vllm/multimodal/audio.py +147 -0
  1269. vllm/multimodal/base.py +56 -0
  1270. vllm/multimodal/cache.py +823 -0
  1271. vllm/multimodal/evs.py +294 -0
  1272. vllm/multimodal/hasher.py +120 -0
  1273. vllm/multimodal/image.py +142 -0
  1274. vllm/multimodal/inputs.py +1089 -0
  1275. vllm/multimodal/parse.py +565 -0
  1276. vllm/multimodal/processing.py +2240 -0
  1277. vllm/multimodal/profiling.py +351 -0
  1278. vllm/multimodal/registry.py +357 -0
  1279. vllm/multimodal/utils.py +513 -0
  1280. vllm/multimodal/video.py +340 -0
  1281. vllm/outputs.py +345 -0
  1282. vllm/platforms/__init__.py +277 -0
  1283. vllm/platforms/cpu.py +421 -0
  1284. vllm/platforms/cuda.py +618 -0
  1285. vllm/platforms/interface.py +695 -0
  1286. vllm/platforms/rocm.py +564 -0
  1287. vllm/platforms/tpu.py +295 -0
  1288. vllm/platforms/xpu.py +277 -0
  1289. vllm/plugins/__init__.py +81 -0
  1290. vllm/plugins/io_processors/__init__.py +68 -0
  1291. vllm/plugins/io_processors/interface.py +77 -0
  1292. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1293. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1294. vllm/pooling_params.py +230 -0
  1295. vllm/profiler/__init__.py +0 -0
  1296. vllm/profiler/layerwise_profile.py +392 -0
  1297. vllm/profiler/utils.py +151 -0
  1298. vllm/profiler/wrapper.py +241 -0
  1299. vllm/py.typed +2 -0
  1300. vllm/ray/__init__.py +0 -0
  1301. vllm/ray/lazy_utils.py +30 -0
  1302. vllm/ray/ray_env.py +79 -0
  1303. vllm/reasoning/__init__.py +96 -0
  1304. vllm/reasoning/abs_reasoning_parsers.py +318 -0
  1305. vllm/reasoning/basic_parsers.py +175 -0
  1306. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1307. vllm/reasoning/deepseek_v3_reasoning_parser.py +67 -0
  1308. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1309. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1310. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1311. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1312. vllm/reasoning/holo2_reasoning_parser.py +88 -0
  1313. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1314. vllm/reasoning/identity_reasoning_parser.py +63 -0
  1315. vllm/reasoning/minimax_m2_reasoning_parser.py +110 -0
  1316. vllm/reasoning/mistral_reasoning_parser.py +154 -0
  1317. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1318. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1319. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1320. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1321. vllm/sampling_params.py +597 -0
  1322. vllm/scalar_type.py +355 -0
  1323. vllm/scripts.py +17 -0
  1324. vllm/sequence.py +98 -0
  1325. vllm/tasks.py +13 -0
  1326. vllm/third_party/__init__.py +0 -0
  1327. vllm/third_party/pynvml.py +6140 -0
  1328. vllm/tokenizers/__init__.py +20 -0
  1329. vllm/tokenizers/deepseek_v32.py +175 -0
  1330. vllm/tokenizers/deepseek_v32_encoding.py +459 -0
  1331. vllm/tokenizers/detokenizer_utils.py +198 -0
  1332. vllm/tokenizers/hf.py +119 -0
  1333. vllm/tokenizers/mistral.py +567 -0
  1334. vllm/tokenizers/protocol.py +114 -0
  1335. vllm/tokenizers/registry.py +233 -0
  1336. vllm/tool_parsers/__init__.py +150 -0
  1337. vllm/tool_parsers/abstract_tool_parser.py +273 -0
  1338. vllm/tool_parsers/deepseekv31_tool_parser.py +388 -0
  1339. vllm/tool_parsers/deepseekv32_tool_parser.py +591 -0
  1340. vllm/tool_parsers/deepseekv3_tool_parser.py +390 -0
  1341. vllm/tool_parsers/ernie45_tool_parser.py +210 -0
  1342. vllm/tool_parsers/gigachat3_tool_parser.py +190 -0
  1343. vllm/tool_parsers/glm4_moe_tool_parser.py +200 -0
  1344. vllm/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  1345. vllm/tool_parsers/granite_tool_parser.py +253 -0
  1346. vllm/tool_parsers/hermes_tool_parser.py +495 -0
  1347. vllm/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  1348. vllm/tool_parsers/internlm2_tool_parser.py +227 -0
  1349. vllm/tool_parsers/jamba_tool_parser.py +323 -0
  1350. vllm/tool_parsers/kimi_k2_tool_parser.py +590 -0
  1351. vllm/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  1352. vllm/tool_parsers/llama_tool_parser.py +324 -0
  1353. vllm/tool_parsers/longcat_tool_parser.py +37 -0
  1354. vllm/tool_parsers/minimax_m2_tool_parser.py +643 -0
  1355. vllm/tool_parsers/minimax_tool_parser.py +849 -0
  1356. vllm/tool_parsers/mistral_tool_parser.py +585 -0
  1357. vllm/tool_parsers/olmo3_tool_parser.py +366 -0
  1358. vllm/tool_parsers/openai_tool_parser.py +102 -0
  1359. vllm/tool_parsers/phi4mini_tool_parser.py +120 -0
  1360. vllm/tool_parsers/pythonic_tool_parser.py +332 -0
  1361. vllm/tool_parsers/qwen3coder_tool_parser.py +781 -0
  1362. vllm/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  1363. vllm/tool_parsers/seed_oss_tool_parser.py +744 -0
  1364. vllm/tool_parsers/step3_tool_parser.py +303 -0
  1365. vllm/tool_parsers/utils.py +229 -0
  1366. vllm/tool_parsers/xlam_tool_parser.py +556 -0
  1367. vllm/tracing.py +135 -0
  1368. vllm/transformers_utils/__init__.py +26 -0
  1369. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1370. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1371. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1372. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1373. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1374. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1375. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1376. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1377. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1378. vllm/transformers_utils/config.py +1144 -0
  1379. vllm/transformers_utils/config_parser_base.py +20 -0
  1380. vllm/transformers_utils/configs/__init__.py +102 -0
  1381. vllm/transformers_utils/configs/afmoe.py +87 -0
  1382. vllm/transformers_utils/configs/arctic.py +216 -0
  1383. vllm/transformers_utils/configs/bagel.py +53 -0
  1384. vllm/transformers_utils/configs/chatglm.py +75 -0
  1385. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1386. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1387. vllm/transformers_utils/configs/eagle.py +90 -0
  1388. vllm/transformers_utils/configs/falcon.py +89 -0
  1389. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1390. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1391. vllm/transformers_utils/configs/jais.py +243 -0
  1392. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1393. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1394. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1395. vllm/transformers_utils/configs/medusa.py +65 -0
  1396. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1397. vllm/transformers_utils/configs/mistral.py +235 -0
  1398. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1399. vllm/transformers_utils/configs/moonvit.py +33 -0
  1400. vllm/transformers_utils/configs/nemotron.py +220 -0
  1401. vllm/transformers_utils/configs/nemotron_h.py +284 -0
  1402. vllm/transformers_utils/configs/olmo3.py +83 -0
  1403. vllm/transformers_utils/configs/ovis.py +182 -0
  1404. vllm/transformers_utils/configs/qwen3_next.py +277 -0
  1405. vllm/transformers_utils/configs/radio.py +89 -0
  1406. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1407. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1408. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1409. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1410. vllm/transformers_utils/configs/tarsier2.py +24 -0
  1411. vllm/transformers_utils/configs/ultravox.py +120 -0
  1412. vllm/transformers_utils/dynamic_module.py +59 -0
  1413. vllm/transformers_utils/gguf_utils.py +280 -0
  1414. vllm/transformers_utils/processor.py +424 -0
  1415. vllm/transformers_utils/processors/__init__.py +25 -0
  1416. vllm/transformers_utils/processors/bagel.py +73 -0
  1417. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1418. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1419. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1420. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1421. vllm/transformers_utils/processors/ovis.py +453 -0
  1422. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1423. vllm/transformers_utils/repo_utils.py +287 -0
  1424. vllm/transformers_utils/runai_utils.py +102 -0
  1425. vllm/transformers_utils/s3_utils.py +95 -0
  1426. vllm/transformers_utils/tokenizer.py +127 -0
  1427. vllm/transformers_utils/tokenizer_base.py +33 -0
  1428. vllm/transformers_utils/utils.py +112 -0
  1429. vllm/triton_utils/__init__.py +20 -0
  1430. vllm/triton_utils/importing.py +103 -0
  1431. vllm/usage/__init__.py +0 -0
  1432. vllm/usage/usage_lib.py +294 -0
  1433. vllm/utils/__init__.py +66 -0
  1434. vllm/utils/argparse_utils.py +492 -0
  1435. vllm/utils/async_utils.py +310 -0
  1436. vllm/utils/cache.py +214 -0
  1437. vllm/utils/collection_utils.py +112 -0
  1438. vllm/utils/counter.py +45 -0
  1439. vllm/utils/deep_gemm.py +400 -0
  1440. vllm/utils/flashinfer.py +528 -0
  1441. vllm/utils/func_utils.py +236 -0
  1442. vllm/utils/gc_utils.py +151 -0
  1443. vllm/utils/hashing.py +117 -0
  1444. vllm/utils/import_utils.py +449 -0
  1445. vllm/utils/jsontree.py +158 -0
  1446. vllm/utils/math_utils.py +32 -0
  1447. vllm/utils/mem_constants.py +13 -0
  1448. vllm/utils/mem_utils.py +232 -0
  1449. vllm/utils/nccl.py +64 -0
  1450. vllm/utils/network_utils.py +331 -0
  1451. vllm/utils/nvtx_pytorch_hooks.py +286 -0
  1452. vllm/utils/platform_utils.py +59 -0
  1453. vllm/utils/profiling.py +56 -0
  1454. vllm/utils/registry.py +51 -0
  1455. vllm/utils/serial_utils.py +214 -0
  1456. vllm/utils/system_utils.py +269 -0
  1457. vllm/utils/tensor_schema.py +255 -0
  1458. vllm/utils/torch_utils.py +648 -0
  1459. vllm/v1/__init__.py +0 -0
  1460. vllm/v1/attention/__init__.py +0 -0
  1461. vllm/v1/attention/backends/__init__.py +0 -0
  1462. vllm/v1/attention/backends/cpu_attn.py +497 -0
  1463. vllm/v1/attention/backends/flash_attn.py +1051 -0
  1464. vllm/v1/attention/backends/flashinfer.py +1575 -0
  1465. vllm/v1/attention/backends/flex_attention.py +1028 -0
  1466. vllm/v1/attention/backends/gdn_attn.py +375 -0
  1467. vllm/v1/attention/backends/linear_attn.py +77 -0
  1468. vllm/v1/attention/backends/mamba1_attn.py +159 -0
  1469. vllm/v1/attention/backends/mamba2_attn.py +348 -0
  1470. vllm/v1/attention/backends/mamba_attn.py +117 -0
  1471. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1472. vllm/v1/attention/backends/mla/aiter_triton_mla.py +74 -0
  1473. vllm/v1/attention/backends/mla/common.py +2114 -0
  1474. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1475. vllm/v1/attention/backends/mla/flashattn_mla.py +342 -0
  1476. vllm/v1/attention/backends/mla/flashinfer_mla.py +174 -0
  1477. vllm/v1/attention/backends/mla/flashmla.py +317 -0
  1478. vllm/v1/attention/backends/mla/flashmla_sparse.py +1020 -0
  1479. vllm/v1/attention/backends/mla/indexer.py +345 -0
  1480. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +275 -0
  1481. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +325 -0
  1482. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1483. vllm/v1/attention/backends/pallas.py +436 -0
  1484. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1485. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1486. vllm/v1/attention/backends/rocm_attn.py +359 -0
  1487. vllm/v1/attention/backends/short_conv_attn.py +104 -0
  1488. vllm/v1/attention/backends/tree_attn.py +428 -0
  1489. vllm/v1/attention/backends/triton_attn.py +497 -0
  1490. vllm/v1/attention/backends/utils.py +1212 -0
  1491. vllm/v1/core/__init__.py +0 -0
  1492. vllm/v1/core/block_pool.py +485 -0
  1493. vllm/v1/core/encoder_cache_manager.py +402 -0
  1494. vllm/v1/core/kv_cache_coordinator.py +570 -0
  1495. vllm/v1/core/kv_cache_manager.py +419 -0
  1496. vllm/v1/core/kv_cache_metrics.py +96 -0
  1497. vllm/v1/core/kv_cache_utils.py +1476 -0
  1498. vllm/v1/core/sched/__init__.py +0 -0
  1499. vllm/v1/core/sched/async_scheduler.py +68 -0
  1500. vllm/v1/core/sched/interface.py +189 -0
  1501. vllm/v1/core/sched/output.py +230 -0
  1502. vllm/v1/core/sched/request_queue.py +217 -0
  1503. vllm/v1/core/sched/scheduler.py +1826 -0
  1504. vllm/v1/core/sched/utils.py +64 -0
  1505. vllm/v1/core/single_type_kv_cache_manager.py +801 -0
  1506. vllm/v1/cudagraph_dispatcher.py +183 -0
  1507. vllm/v1/engine/__init__.py +217 -0
  1508. vllm/v1/engine/async_llm.py +866 -0
  1509. vllm/v1/engine/coordinator.py +377 -0
  1510. vllm/v1/engine/core.py +1455 -0
  1511. vllm/v1/engine/core_client.py +1416 -0
  1512. vllm/v1/engine/detokenizer.py +351 -0
  1513. vllm/v1/engine/exceptions.py +18 -0
  1514. vllm/v1/engine/input_processor.py +643 -0
  1515. vllm/v1/engine/llm_engine.py +414 -0
  1516. vllm/v1/engine/logprobs.py +189 -0
  1517. vllm/v1/engine/output_processor.py +659 -0
  1518. vllm/v1/engine/parallel_sampling.py +145 -0
  1519. vllm/v1/engine/processor.py +20 -0
  1520. vllm/v1/engine/utils.py +1068 -0
  1521. vllm/v1/executor/__init__.py +6 -0
  1522. vllm/v1/executor/abstract.py +352 -0
  1523. vllm/v1/executor/multiproc_executor.py +890 -0
  1524. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1525. vllm/v1/executor/ray_executor.py +626 -0
  1526. vllm/v1/executor/ray_utils.py +465 -0
  1527. vllm/v1/executor/uniproc_executor.py +186 -0
  1528. vllm/v1/kv_cache_interface.py +404 -0
  1529. vllm/v1/kv_offload/__init__.py +0 -0
  1530. vllm/v1/kv_offload/abstract.py +161 -0
  1531. vllm/v1/kv_offload/arc_manager.py +237 -0
  1532. vllm/v1/kv_offload/backend.py +97 -0
  1533. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1534. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1535. vllm/v1/kv_offload/cpu.py +86 -0
  1536. vllm/v1/kv_offload/factory.py +56 -0
  1537. vllm/v1/kv_offload/lru_manager.py +139 -0
  1538. vllm/v1/kv_offload/mediums.py +39 -0
  1539. vllm/v1/kv_offload/spec.py +66 -0
  1540. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1541. vllm/v1/kv_offload/worker/cpu_gpu.py +280 -0
  1542. vllm/v1/kv_offload/worker/worker.py +144 -0
  1543. vllm/v1/metrics/__init__.py +0 -0
  1544. vllm/v1/metrics/loggers.py +1305 -0
  1545. vllm/v1/metrics/prometheus.py +82 -0
  1546. vllm/v1/metrics/ray_wrappers.py +194 -0
  1547. vllm/v1/metrics/reader.py +257 -0
  1548. vllm/v1/metrics/stats.py +437 -0
  1549. vllm/v1/outputs.py +245 -0
  1550. vllm/v1/pool/__init__.py +0 -0
  1551. vllm/v1/pool/metadata.py +126 -0
  1552. vllm/v1/request.py +282 -0
  1553. vllm/v1/sample/__init__.py +0 -0
  1554. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1555. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1556. vllm/v1/sample/logits_processor/interface.py +106 -0
  1557. vllm/v1/sample/logits_processor/state.py +165 -0
  1558. vllm/v1/sample/metadata.py +44 -0
  1559. vllm/v1/sample/ops/__init__.py +0 -0
  1560. vllm/v1/sample/ops/bad_words.py +52 -0
  1561. vllm/v1/sample/ops/logprobs.py +25 -0
  1562. vllm/v1/sample/ops/penalties.py +57 -0
  1563. vllm/v1/sample/ops/topk_topp_sampler.py +384 -0
  1564. vllm/v1/sample/rejection_sampler.py +805 -0
  1565. vllm/v1/sample/sampler.py +319 -0
  1566. vllm/v1/sample/tpu/__init__.py +0 -0
  1567. vllm/v1/sample/tpu/metadata.py +120 -0
  1568. vllm/v1/sample/tpu/sampler.py +215 -0
  1569. vllm/v1/serial_utils.py +514 -0
  1570. vllm/v1/spec_decode/__init__.py +0 -0
  1571. vllm/v1/spec_decode/eagle.py +1331 -0
  1572. vllm/v1/spec_decode/medusa.py +73 -0
  1573. vllm/v1/spec_decode/metadata.py +66 -0
  1574. vllm/v1/spec_decode/metrics.py +225 -0
  1575. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1576. vllm/v1/spec_decode/suffix_decoding.py +101 -0
  1577. vllm/v1/spec_decode/utils.py +121 -0
  1578. vllm/v1/structured_output/__init__.py +353 -0
  1579. vllm/v1/structured_output/backend_guidance.py +265 -0
  1580. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1581. vllm/v1/structured_output/backend_outlines.py +324 -0
  1582. vllm/v1/structured_output/backend_types.py +136 -0
  1583. vllm/v1/structured_output/backend_xgrammar.py +378 -0
  1584. vllm/v1/structured_output/request.py +94 -0
  1585. vllm/v1/structured_output/utils.py +469 -0
  1586. vllm/v1/utils.py +414 -0
  1587. vllm/v1/worker/__init__.py +0 -0
  1588. vllm/v1/worker/block_table.py +343 -0
  1589. vllm/v1/worker/cp_utils.py +42 -0
  1590. vllm/v1/worker/cpu_model_runner.py +122 -0
  1591. vllm/v1/worker/cpu_worker.py +192 -0
  1592. vllm/v1/worker/dp_utils.py +240 -0
  1593. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1594. vllm/v1/worker/gpu/README.md +4 -0
  1595. vllm/v1/worker/gpu/__init__.py +0 -0
  1596. vllm/v1/worker/gpu/async_utils.py +98 -0
  1597. vllm/v1/worker/gpu/attn_utils.py +189 -0
  1598. vllm/v1/worker/gpu/block_table.py +314 -0
  1599. vllm/v1/worker/gpu/cudagraph_utils.py +259 -0
  1600. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1601. vllm/v1/worker/gpu/input_batch.py +479 -0
  1602. vllm/v1/worker/gpu/metrics/__init__.py +0 -0
  1603. vllm/v1/worker/gpu/metrics/logits.py +42 -0
  1604. vllm/v1/worker/gpu/model_runner.py +1006 -0
  1605. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1606. vllm/v1/worker/gpu/sample/gumbel.py +101 -0
  1607. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1608. vllm/v1/worker/gpu/sample/metadata.py +192 -0
  1609. vllm/v1/worker/gpu/sample/min_p.py +51 -0
  1610. vllm/v1/worker/gpu/sample/output.py +14 -0
  1611. vllm/v1/worker/gpu/sample/penalties.py +155 -0
  1612. vllm/v1/worker/gpu/sample/sampler.py +87 -0
  1613. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1614. vllm/v1/worker/gpu/spec_decode/eagle.py +565 -0
  1615. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1616. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +71 -0
  1617. vllm/v1/worker/gpu/states.py +316 -0
  1618. vllm/v1/worker/gpu/structured_outputs.py +76 -0
  1619. vllm/v1/worker/gpu_input_batch.py +990 -0
  1620. vllm/v1/worker/gpu_model_runner.py +5470 -0
  1621. vllm/v1/worker/gpu_ubatch_wrapper.py +472 -0
  1622. vllm/v1/worker/gpu_worker.py +955 -0
  1623. vllm/v1/worker/kv_connector_model_runner_mixin.py +302 -0
  1624. vllm/v1/worker/lora_model_runner_mixin.py +212 -0
  1625. vllm/v1/worker/tpu_input_batch.py +583 -0
  1626. vllm/v1/worker/tpu_model_runner.py +2191 -0
  1627. vllm/v1/worker/tpu_worker.py +352 -0
  1628. vllm/v1/worker/ubatch_utils.py +109 -0
  1629. vllm/v1/worker/ubatching.py +231 -0
  1630. vllm/v1/worker/utils.py +375 -0
  1631. vllm/v1/worker/worker_base.py +377 -0
  1632. vllm/v1/worker/workspace.py +253 -0
  1633. vllm/v1/worker/xpu_model_runner.py +48 -0
  1634. vllm/v1/worker/xpu_worker.py +174 -0
  1635. vllm/version.py +39 -0
  1636. vllm/vllm_flash_attn/.gitkeep +0 -0
  1637. vllm_cpu_avx512vnni-0.13.0.dist-info/METADATA +339 -0
  1638. vllm_cpu_avx512vnni-0.13.0.dist-info/RECORD +1641 -0
  1639. vllm_cpu_avx512vnni-0.13.0.dist-info/WHEEL +5 -0
  1640. vllm_cpu_avx512vnni-0.13.0.dist-info/entry_points.txt +5 -0
  1641. vllm_cpu_avx512vnni-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2068 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ import argparse
5
+ import copy
6
+ import dataclasses
7
+ import functools
8
+ import json
9
+ import sys
10
+ from collections.abc import Callable
11
+ from dataclasses import MISSING, dataclass, fields, is_dataclass
12
+ from itertools import permutations
13
+ from types import UnionType
14
+ from typing import (
15
+ TYPE_CHECKING,
16
+ Annotated,
17
+ Any,
18
+ Literal,
19
+ TypeAlias,
20
+ TypeVar,
21
+ Union,
22
+ cast,
23
+ get_args,
24
+ get_origin,
25
+ )
26
+
27
+ import huggingface_hub
28
+ import regex as re
29
+ import torch
30
+ from pydantic import TypeAdapter, ValidationError
31
+ from pydantic.fields import FieldInfo
32
+ from typing_extensions import TypeIs
33
+
34
+ import vllm.envs as envs
35
+ from vllm.attention.backends.registry import AttentionBackendEnum
36
+ from vllm.config import (
37
+ AttentionConfig,
38
+ CacheConfig,
39
+ CompilationConfig,
40
+ ConfigType,
41
+ DeviceConfig,
42
+ ECTransferConfig,
43
+ EPLBConfig,
44
+ KVEventsConfig,
45
+ KVTransferConfig,
46
+ LoadConfig,
47
+ LoRAConfig,
48
+ ModelConfig,
49
+ MultiModalConfig,
50
+ ObservabilityConfig,
51
+ ParallelConfig,
52
+ PoolerConfig,
53
+ ProfilerConfig,
54
+ SchedulerConfig,
55
+ SpeculativeConfig,
56
+ StructuredOutputsConfig,
57
+ VllmConfig,
58
+ get_attr_docs,
59
+ )
60
+ from vllm.config.cache import (
61
+ BlockSize,
62
+ CacheDType,
63
+ KVOffloadingBackend,
64
+ MambaDType,
65
+ PrefixCachingHashAlgo,
66
+ )
67
+ from vllm.config.device import Device
68
+ from vllm.config.model import (
69
+ ConvertOption,
70
+ HfOverrides,
71
+ LogprobsMode,
72
+ ModelDType,
73
+ RunnerOption,
74
+ TokenizerMode,
75
+ )
76
+ from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
77
+ from vllm.config.observability import DetailedTraceModules
78
+ from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
79
+ from vllm.config.scheduler import SchedulerPolicy
80
+ from vllm.config.utils import get_field
81
+ from vllm.config.vllm import OptimizationLevel
82
+ from vllm.logger import init_logger, suppress_logging
83
+ from vllm.platforms import CpuArchEnum, current_platform
84
+ from vllm.plugins import load_general_plugins
85
+ from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized
86
+ from vllm.transformers_utils.config import (
87
+ is_interleaved,
88
+ maybe_override_with_speculators,
89
+ )
90
+ from vllm.transformers_utils.gguf_utils import is_gguf
91
+ from vllm.transformers_utils.repo_utils import get_model_path
92
+ from vllm.transformers_utils.utils import is_cloud_storage
93
+ from vllm.utils.argparse_utils import FlexibleArgumentParser
94
+ from vllm.utils.mem_constants import GiB_bytes
95
+ from vllm.utils.network_utils import get_ip
96
+ from vllm.v1.sample.logits_processor import LogitsProcessor
97
+
98
+ if TYPE_CHECKING:
99
+ from vllm.model_executor.layers.quantization import QuantizationMethods
100
+ from vllm.model_executor.model_loader import LoadFormats
101
+ from vllm.usage.usage_lib import UsageContext
102
+ from vllm.v1.executor import Executor
103
+ else:
104
+ Executor = Any
105
+ QuantizationMethods = Any
106
+ LoadFormats = Any
107
+ UsageContext = Any
108
+
109
+ logger = init_logger(__name__)
110
+
111
+ # object is used to allow for special typing forms
112
+ T = TypeVar("T")
113
+ TypeHint: TypeAlias = type[Any] | object
114
+ TypeHintT: TypeAlias = type[T] | object
115
+
116
+
117
def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
    """Wrap *return_type* so conversion failures raise ArgumentTypeError.

    argparse only renders user-friendly messages for
    `argparse.ArgumentTypeError`, so the converter's raw `ValueError`
    is translated into one.
    """

    def _parse_type(val: str) -> T:
        try:
            converted = return_type(val)
        except ValueError as exc:
            raise argparse.ArgumentTypeError(
                f"Value {val} cannot be converted to {return_type}."
            ) from exc
        return converted

    return _parse_type
127
+
128
+
129
def optional_type(return_type: Callable[[str], T]) -> Callable[[str], T | None]:
    """Wrap *return_type* so the strings "" and "None" parse to None."""

    def _optional_type(val: str) -> T | None:
        if val in ("", "None"):
            return None
        return parse_type(return_type)(val)

    return _optional_type
136
+
137
+
138
def union_dict_and_str(val: str) -> str | dict[str, str] | None:
    """Parse *val* as a JSON dict when it looks like one, else keep it a str."""
    looks_like_json_object = re.match(r"(?s)^\s*{.*}\s*$", val) is not None
    if looks_like_json_object:
        return optional_type(json.loads)(val)
    return str(val)
142
+
143
+
144
def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
    """Check if the type hint is a specific type."""
    # Match either the plain type itself or a subscripted form of it
    # (e.g. `list` matches both `list` and `list[int]`).
    if type_hint is type:
        return True
    return get_origin(type_hint) is type
147
+
148
+
149
def contains_type(type_hints: set[TypeHint], type: TypeHintT) -> bool:
    """Check if the type hints contain a specific type."""
    for hint in type_hints:
        if is_type(hint, type):
            return True
    return False
152
+
153
+
154
def get_type(type_hints: set[TypeHint], type: TypeHintT) -> TypeHintT | None:
    """Get the specific type from the type hints.

    Returns None when no matching hint is present; callers that have
    already checked `contains_type` will always receive a hint.
    """
    # Annotation fixed to include `| None`: `next(..., None)` returns
    # None when nothing in `type_hints` matches.
    return next((th for th in type_hints if is_type(th, type)), None)
157
+
158
+
159
def literal_to_kwargs(type_hints: set[TypeHint]) -> dict[str, Any]:
    """Get the `type` and `choices` from a `Literal` type hint in `type_hints`.

    If `type_hints` also contains `str`, we use `metavar` instead of `choices`.
    """
    literal_hint = get_type(type_hints, Literal)
    options = get_args(literal_hint)
    first_type = type(options[0])
    # All literal members must share one runtime type so argparse can
    # coerce the CLI string consistently.
    if any(not isinstance(option, first_type) for option in options):
        raise ValueError(
            "All options must be of the same type. "
            f"Got {options} with types {[type(c) for c in options]}"
        )
    choices_key = "metavar" if contains_type(type_hints, str) else "choices"
    return {"type": first_type, choices_key: sorted(options)}
174
+
175
+
176
def collection_to_kwargs(type_hints: set[TypeHint], type: TypeHint) -> dict[str, Any]:
    """Build argparse `type`/`nargs` kwargs for a list/set/tuple hint."""
    collection_hint = get_type(type_hints, type)
    arg_types = get_args(collection_hint)
    elem_type = arg_types[0]

    # Handle Ellipsis (e.g. tuple[int, ...]): every concrete element
    # type must be the same.
    assert all(t is elem_type for t in arg_types if t is not Ellipsis), (
        f"All non-Ellipsis elements must be of the same type. Got {arg_types}."
    )

    # Handle Union types
    if get_origin(elem_type) in {Union, UnionType}:
        # Union for Union[X, Y] and UnionType for X | Y
        assert str in get_args(elem_type), (
            "If element can have multiple types, one must be 'str' "
            f"(i.e. 'list[int | str]'). Got {elem_type}."
        )
        elem_type = str

    # Fixed-arity tuples take exactly len(arg_types) values; everything
    # else (lists, sets, variadic tuples) takes one-or-more.
    if type is tuple and Ellipsis not in arg_types:
        nargs: Any = len(arg_types)
    else:
        nargs = "+"
    return {"type": elem_type, "nargs": nargs}
199
+
200
+
201
def is_not_builtin(type_hint: TypeHint) -> bool:
    """Check if the class is not a built-in type."""
    module_name = type_hint.__module__
    return module_name != "builtins"
204
+
205
+
206
def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
    """Extract type hints from Annotated or Union type hints."""
    origin = get_origin(type_hint)

    # Annotated[X, ...]: the hints are whatever X flattens to.
    if origin is Annotated:
        inner = get_args(type_hint)[0]
        return get_type_hints(inner)

    # Union for Union[X, Y] and UnionType for X | Y: flatten members.
    if origin in {Union, UnionType}:
        flattened: set[TypeHint] = set()
        for member in get_args(type_hint):
            flattened |= get_type_hints(member)
        return flattened

    # Plain hint: it is its own singleton set.
    return {type_hint}
222
+
223
+
224
def is_online_quantization(quantization: Any) -> bool:
    """Return True for quantization methods applied online (currently only "inc")."""
    online_methods = ["inc"]
    return quantization in online_methods
226
+
227
+
228
# True when this process is rendering help text or building docs, in
# which case `_compute_kwargs` extracts attribute docstrings (expensive)
# for argparse help; otherwise that work is skipped. The walrus binds
# argv0 once and reuses it on the following line.
NEEDS_HELP = (
    any("--help" in arg for arg in sys.argv)  # vllm SUBCOMMAND --help
    or (argv0 := sys.argv[0]).endswith("mkdocs")  # mkdocs SUBCOMMAND
    or argv0.endswith("mkdocs/__main__.py")  # python -m mkdocs SUBCOMMAND
)
233
+
234
+
235
@functools.lru_cache(maxsize=30)
def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
    """Build the argparse ``add_argument`` kwargs for each field of *cls*.

    Maps every dataclass field name to a kwargs dict containing
    ``default`` and ``help`` plus, depending on the field's type hints,
    one of ``type``/``action``/``choices``/``metavar``/``nargs``.
    Results are cached (mutable return value!) — callers must go
    through `get_kwargs`, which deep-copies before returning.
    """
    # Save time only getting attr docs if we're generating help text
    cls_docs = get_attr_docs(cls) if NEEDS_HELP else {}
    kwargs = {}
    for field in fields(cls):
        # Get the set of possible types for the field
        type_hints: set[TypeHint] = get_type_hints(field.type)

        # If the field is a dataclass, we can use the model_validate_json
        generator = (th for th in type_hints if is_dataclass(th))
        dataclass_cls = next(generator, None)

        # Get the default value of the field
        if field.default is not MISSING:
            default = field.default
            # Handle pydantic.Field defaults
            if isinstance(default, FieldInfo):
                if default.default_factory is None:
                    default = default.default
                else:
                    # VllmConfig's Fields have default_factory set to config classes.
                    # These could emit logs on init, which would be confusing.
                    with suppress_logging():
                        default = default.default_factory()
        elif field.default_factory is not MISSING:
            default = field.default_factory()

        # Get the help text for the field
        name = field.name
        help = cls_docs.get(name, "").strip()
        # Escape % for argparse
        help = help.replace("%", "%%")

        # Initialise the kwargs dictionary for the field
        kwargs[name] = {"default": default, "help": help}

        # Set other kwargs based on the type hints
        json_tip = (
            "Should either be a valid JSON string or JSON keys passed individually."
        )
        # Dispatch chain: order matters — dataclasses first, then bool
        # (flag action), then Literal/collections, then scalar types,
        # with str/non-builtin as the catch-all.
        if dataclass_cls is not None:
            # `cls=dataclass_cls` binds the current loop value as a
            # default to avoid the late-binding-closure pitfall.
            def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
                try:
                    return TypeAdapter(cls).validate_json(val)
                except ValidationError as e:
                    raise argparse.ArgumentTypeError(repr(e)) from e

            kwargs[name]["type"] = parse_dataclass
            kwargs[name]["help"] += f"\n\n{json_tip}"
        elif contains_type(type_hints, bool):
            # Creates --no-<name> and --<name> flags
            kwargs[name]["action"] = argparse.BooleanOptionalAction
        elif contains_type(type_hints, Literal):
            kwargs[name].update(literal_to_kwargs(type_hints))
        elif contains_type(type_hints, tuple):
            kwargs[name].update(collection_to_kwargs(type_hints, tuple))
        elif contains_type(type_hints, list):
            kwargs[name].update(collection_to_kwargs(type_hints, list))
        elif contains_type(type_hints, set):
            kwargs[name].update(collection_to_kwargs(type_hints, set))
        elif contains_type(type_hints, int):
            kwargs[name]["type"] = int
            # Special case for large integers
            human_readable_ints = {
                "max_model_len",
                "max_num_batched_tokens",
                "kv_cache_memory_bytes",
            }
            if name in human_readable_ints:
                kwargs[name]["type"] = human_readable_int
                kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
        elif contains_type(type_hints, float):
            kwargs[name]["type"] = float
        elif contains_type(type_hints, dict) and (
            contains_type(type_hints, str)
            or any(is_not_builtin(th) for th in type_hints)
        ):
            # dict-or-str union: parse JSON-looking values as dicts,
            # keep everything else as the raw string.
            kwargs[name]["type"] = union_dict_and_str
        elif contains_type(type_hints, dict):
            kwargs[name]["type"] = parse_type(json.loads)
            kwargs[name]["help"] += f"\n\n{json_tip}"
        elif contains_type(type_hints, str) or any(
            is_not_builtin(th) for th in type_hints
        ):
            kwargs[name]["type"] = str
        else:
            raise ValueError(f"Unsupported type {type_hints} for argument {name}.")

        # If the type hint was a sequence of literals, use the helper function
        # to update the type and choices
        if get_origin(kwargs[name].get("type")) is Literal:
            kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]}))

        # If None is in type_hints, make the argument optional.
        # But not if it's a bool, argparse will handle this better.
        if type(None) in type_hints and not contains_type(type_hints, bool):
            kwargs[name]["type"] = optional_type(kwargs[name]["type"])
            if kwargs[name].get("choices"):
                kwargs[name]["choices"].append("None")
    return kwargs
337
+
338
+
339
def get_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
    """Return argparse kwargs for the given Config dataclass.

    If `--help` or `mkdocs` are not present in the command line command, the
    attribute documentation will not be included in the help output.

    The heavy computation is cached via functools.lru_cache, and a deep copy
    is returned so callers can mutate the dictionary without affecting the
    cached version.
    """
    cached = _compute_kwargs(cls)
    return copy.deepcopy(cached)
350
+
351
+
352
+ @dataclass
353
+ class EngineArgs:
354
+ """Arguments for vLLM engine."""
355
+
356
+ model: str = ModelConfig.model
357
+ served_model_name: str | list[str] | None = ModelConfig.served_model_name
358
+ tokenizer: str | None = ModelConfig.tokenizer
359
+ hf_config_path: str | None = ModelConfig.hf_config_path
360
+ runner: RunnerOption = ModelConfig.runner
361
+ convert: ConvertOption = ModelConfig.convert
362
+ skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
363
+ enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
364
+ tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
365
+ trust_remote_code: bool = ModelConfig.trust_remote_code
366
+ allowed_local_media_path: str = ModelConfig.allowed_local_media_path
367
+ allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
368
+ download_dir: str | None = LoadConfig.download_dir
369
+ safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
370
+ load_format: str | LoadFormats = LoadConfig.load_format
371
+ config_format: str = ModelConfig.config_format
372
+ dtype: ModelDType = ModelConfig.dtype
373
+ kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
374
+ seed: int = ModelConfig.seed
375
+ max_model_len: int | None = ModelConfig.max_model_len
376
+ cudagraph_capture_sizes: list[int] | None = (
377
+ CompilationConfig.cudagraph_capture_sizes
378
+ )
379
+ max_cudagraph_capture_size: int | None = get_field(
380
+ CompilationConfig, "max_cudagraph_capture_size"
381
+ )
382
+ # Note: Specifying a custom executor backend by passing a class
383
+ # is intended for expert use only. The API may change without
384
+ # notice.
385
+ distributed_executor_backend: (
386
+ str | DistributedExecutorBackend | type[Executor] | None
387
+ ) = ParallelConfig.distributed_executor_backend
388
+ # number of P/D disaggregation (or other disaggregation) workers
389
+ pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
390
+ master_addr: str = ParallelConfig.master_addr
391
+ master_port: int = ParallelConfig.master_port
392
+ nnodes: int = ParallelConfig.nnodes
393
+ node_rank: int = ParallelConfig.node_rank
394
+ tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
395
+ prefill_context_parallel_size: int = ParallelConfig.prefill_context_parallel_size
396
+ decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size
397
+ dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size
398
+ cp_kv_cache_interleave_size: int = ParallelConfig.cp_kv_cache_interleave_size
399
+ data_parallel_size: int = ParallelConfig.data_parallel_size
400
+ data_parallel_rank: int | None = None
401
+ data_parallel_start_rank: int | None = None
402
+ data_parallel_size_local: int | None = None
403
+ data_parallel_address: str | None = None
404
+ data_parallel_rpc_port: int | None = None
405
+ data_parallel_hybrid_lb: bool = False
406
+ data_parallel_external_lb: bool = False
407
+ data_parallel_backend: str = ParallelConfig.data_parallel_backend
408
+ enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
409
+ all2all_backend: str | None = ParallelConfig.all2all_backend
410
+ enable_dbo: bool = ParallelConfig.enable_dbo
411
+ dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
412
+ dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
413
+ disable_nccl_for_dp_synchronization: bool = (
414
+ ParallelConfig.disable_nccl_for_dp_synchronization
415
+ )
416
+ eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
417
+ enable_eplb: bool = ParallelConfig.enable_eplb
418
+ expert_placement_strategy: ExpertPlacementStrategy = (
419
+ ParallelConfig.expert_placement_strategy
420
+ )
421
+ _api_process_count: int = ParallelConfig._api_process_count
422
+ _api_process_rank: int = ParallelConfig._api_process_rank
423
+ max_parallel_loading_workers: int | None = (
424
+ ParallelConfig.max_parallel_loading_workers
425
+ )
426
+ block_size: BlockSize | None = CacheConfig.block_size
427
+ enable_prefix_caching: bool | None = None
428
+ prefix_caching_hash_algo: PrefixCachingHashAlgo = (
429
+ CacheConfig.prefix_caching_hash_algo
430
+ )
431
+ disable_sliding_window: bool = ModelConfig.disable_sliding_window
432
+ disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
433
+ swap_space: float = CacheConfig.swap_space
434
+ cpu_offload_gb: float = CacheConfig.cpu_offload_gb
435
+ gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
436
+ kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
437
+ max_num_batched_tokens: int | None = None
438
+ max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
439
+ max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
440
+ long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
441
+ max_num_seqs: int | None = None
442
+ max_logprobs: int = ModelConfig.max_logprobs
443
+ logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
444
+ disable_log_stats: bool = False
445
+ aggregate_engine_logging: bool = False
446
+ revision: str | None = ModelConfig.revision
447
+ code_revision: str | None = ModelConfig.code_revision
448
+ hf_token: bool | str | None = ModelConfig.hf_token
449
+ hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
450
+ tokenizer_revision: str | None = ModelConfig.tokenizer_revision
451
+ quantization: QuantizationMethods | None = ModelConfig.quantization
452
+ enforce_eager: bool = ModelConfig.enforce_eager
453
+ disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
454
+ limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
455
+ MultiModalConfig, "limit_per_prompt"
456
+ )
457
+ enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
458
+ interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
459
+ media_io_kwargs: dict[str, dict[str, Any]] = get_field(
460
+ MultiModalConfig, "media_io_kwargs"
461
+ )
462
+ mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
463
+ mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
464
+ mm_processor_cache_type: MMCacheType | None = (
465
+ MultiModalConfig.mm_processor_cache_type
466
+ )
467
+ mm_shm_cache_max_object_size_mb: int = (
468
+ MultiModalConfig.mm_shm_cache_max_object_size_mb
469
+ )
470
+ mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
471
+ mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
472
+ MultiModalConfig.mm_encoder_attn_backend
473
+ )
474
+ io_processor_plugin: str | None = None
475
+ skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
476
+ video_pruning_rate: float = MultiModalConfig.video_pruning_rate
477
+ # LoRA fields
478
+ enable_lora: bool = False
479
+ max_loras: int = LoRAConfig.max_loras
480
+ max_lora_rank: int = LoRAConfig.max_lora_rank
481
+ default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
482
+ fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
483
+ max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
484
+ lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
485
+
486
+ ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
487
+ num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
488
+ model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
489
+ ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
490
+
491
+ enable_chunked_prefill: bool | None = None
492
+ disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
493
+
494
+ disable_hybrid_kv_cache_manager: bool | None = (
495
+ SchedulerConfig.disable_hybrid_kv_cache_manager
496
+ )
497
+
498
+ structured_outputs_config: StructuredOutputsConfig = get_field(
499
+ VllmConfig, "structured_outputs_config"
500
+ )
501
+ reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
502
+ reasoning_parser_plugin: str | None = None
503
+
504
+ logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern
505
+
506
+ speculative_config: dict[str, Any] | None = None
507
+
508
+ show_hidden_metrics_for_version: str | None = (
509
+ ObservabilityConfig.show_hidden_metrics_for_version
510
+ )
511
+ otlp_traces_endpoint: str | None = ObservabilityConfig.otlp_traces_endpoint
512
+ collect_detailed_traces: list[DetailedTraceModules] | None = (
513
+ ObservabilityConfig.collect_detailed_traces
514
+ )
515
+ kv_cache_metrics: bool = ObservabilityConfig.kv_cache_metrics
516
+ kv_cache_metrics_sample: float = get_field(
517
+ ObservabilityConfig, "kv_cache_metrics_sample"
518
+ )
519
+ cudagraph_metrics: bool = ObservabilityConfig.cudagraph_metrics
520
+ enable_layerwise_nvtx_tracing: bool = (
521
+ ObservabilityConfig.enable_layerwise_nvtx_tracing
522
+ )
523
+ scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
524
+ scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
525
+
526
+ pooler_config: PoolerConfig | None = ModelConfig.pooler_config
527
+ compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
528
+ attention_config: AttentionConfig = get_field(VllmConfig, "attention_config")
529
+ worker_cls: str = ParallelConfig.worker_cls
530
+ worker_extension_cls: str = ParallelConfig.worker_extension_cls
531
+
532
+ profiler_config: ProfilerConfig = get_field(VllmConfig, "profiler_config")
533
+
534
+ kv_transfer_config: KVTransferConfig | None = None
535
+ kv_events_config: KVEventsConfig | None = None
536
+
537
+ ec_transfer_config: ECTransferConfig | None = None
538
+
539
+ generation_config: str = ModelConfig.generation_config
540
+ enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
541
+ override_generation_config: dict[str, Any] = get_field(
542
+ ModelConfig, "override_generation_config"
543
+ )
544
+ model_impl: str = ModelConfig.model_impl
545
+ override_attention_dtype: str = ModelConfig.override_attention_dtype
546
+ attention_backend: AttentionBackendEnum | None = AttentionConfig.backend
547
+
548
+ calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
549
+ mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype
550
+ mamba_ssm_cache_dtype: MambaDType = CacheConfig.mamba_ssm_cache_dtype
551
+ mamba_block_size: int | None = get_field(CacheConfig, "mamba_block_size")
552
+
553
+ additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
554
+
555
+ use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
556
+ pt_load_map_location: str = LoadConfig.pt_load_map_location
557
+
558
+ logits_processors: list[str | type[LogitsProcessor]] | None = (
559
+ ModelConfig.logits_processors
560
+ )
561
+ """Custom logitproc types"""
562
+
563
+ async_scheduling: bool | None = SchedulerConfig.async_scheduling
564
+
565
+ stream_interval: int = SchedulerConfig.stream_interval
566
+
567
+ kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill
568
+ optimization_level: OptimizationLevel = VllmConfig.optimization_level
569
+
570
+ kv_offloading_size: float | None = CacheConfig.kv_offloading_size
571
+ kv_offloading_backend: KVOffloadingBackend | None = (
572
+ CacheConfig.kv_offloading_backend
573
+ )
574
+ tokens_only: bool = False
575
+
576
+ def __post_init__(self):
577
+ # support `EngineArgs(compilation_config={...})`
578
+ # without having to manually construct a
579
+ # CompilationConfig object
580
+ if isinstance(self.compilation_config, dict):
581
+ self.compilation_config = CompilationConfig(**self.compilation_config)
582
+ if isinstance(self.attention_config, dict):
583
+ self.attention_config = AttentionConfig(**self.attention_config)
584
+ if isinstance(self.eplb_config, dict):
585
+ self.eplb_config = EPLBConfig(**self.eplb_config)
586
+ # Setup plugins
587
+ from vllm.plugins import load_general_plugins
588
+
589
+ load_general_plugins()
590
+ # when use hf offline,replace model and tokenizer id to local model path
591
+ if huggingface_hub.constants.HF_HUB_OFFLINE:
592
+ model_id = self.model
593
+ self.model = get_model_path(self.model, self.revision)
594
+ if model_id is not self.model:
595
+ logger.info(
596
+ "HF_HUB_OFFLINE is True, replace model_id [%s] to model_path [%s]",
597
+ model_id,
598
+ self.model,
599
+ )
600
+ if self.tokenizer is not None:
601
+ tokenizer_id = self.tokenizer
602
+ self.tokenizer = get_model_path(self.tokenizer, self.tokenizer_revision)
603
+ if tokenizer_id is not self.tokenizer:
604
+ logger.info(
605
+ "HF_HUB_OFFLINE is True, replace tokenizer_id [%s] "
606
+ "to tokenizer_path [%s]",
607
+ tokenizer_id,
608
+ self.tokenizer,
609
+ )
610
+
611
+ @staticmethod
612
+ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
613
+ """Shared CLI arguments for vLLM engine."""
614
+
615
+ # Model arguments
616
+ model_kwargs = get_kwargs(ModelConfig)
617
+ model_group = parser.add_argument_group(
618
+ title="ModelConfig",
619
+ description=ModelConfig.__doc__,
620
+ )
621
+ if not ("serve" in sys.argv[1:] and "--help" in sys.argv[1:]):
622
+ model_group.add_argument("--model", **model_kwargs["model"])
623
+ model_group.add_argument("--runner", **model_kwargs["runner"])
624
+ model_group.add_argument("--convert", **model_kwargs["convert"])
625
+ model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
626
+ model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
627
+ model_group.add_argument(
628
+ "--trust-remote-code", **model_kwargs["trust_remote_code"]
629
+ )
630
+ model_group.add_argument("--dtype", **model_kwargs["dtype"])
631
+ model_group.add_argument("--seed", **model_kwargs["seed"])
632
+ model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
633
+ model_group.add_argument(
634
+ "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]
635
+ )
636
+ model_group.add_argument(
637
+ "--allowed-media-domains", **model_kwargs["allowed_media_domains"]
638
+ )
639
+ model_group.add_argument("--revision", **model_kwargs["revision"])
640
+ model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
641
+ model_group.add_argument(
642
+ "--tokenizer-revision", **model_kwargs["tokenizer_revision"]
643
+ )
644
+ model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
645
+ model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
646
+ model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
647
+ model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"])
648
+ model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"])
649
+ model_group.add_argument(
650
+ "--disable-sliding-window", **model_kwargs["disable_sliding_window"]
651
+ )
652
+ model_group.add_argument(
653
+ "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
654
+ )
655
+ model_group.add_argument(
656
+ "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]
657
+ )
658
+ model_group.add_argument(
659
+ "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
660
+ )
661
+ model_group.add_argument(
662
+ "--served-model-name", **model_kwargs["served_model_name"]
663
+ )
664
+ model_group.add_argument("--config-format", **model_kwargs["config_format"])
665
+ # This one is a special case because it can bool
666
+ # or str. TODO: Handle this in get_kwargs
667
+ model_group.add_argument(
668
+ "--hf-token",
669
+ type=str,
670
+ nargs="?",
671
+ const=True,
672
+ default=model_kwargs["hf_token"]["default"],
673
+ help=model_kwargs["hf_token"]["help"],
674
+ )
675
+ model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"])
676
+ model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"])
677
+ model_group.add_argument(
678
+ "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]
679
+ )
680
+ model_group.add_argument(
681
+ "--generation-config", **model_kwargs["generation_config"]
682
+ )
683
+ model_group.add_argument(
684
+ "--override-generation-config", **model_kwargs["override_generation_config"]
685
+ )
686
+ model_group.add_argument(
687
+ "--enable-sleep-mode", **model_kwargs["enable_sleep_mode"]
688
+ )
689
+ model_group.add_argument("--model-impl", **model_kwargs["model_impl"])
690
+ model_group.add_argument(
691
+ "--override-attention-dtype", **model_kwargs["override_attention_dtype"]
692
+ )
693
+ model_group.add_argument(
694
+ "--logits-processors", **model_kwargs["logits_processors"]
695
+ )
696
+ model_group.add_argument(
697
+ "--io-processor-plugin", **model_kwargs["io_processor_plugin"]
698
+ )
699
+
700
+ # Model loading arguments
701
+ load_kwargs = get_kwargs(LoadConfig)
702
+ load_group = parser.add_argument_group(
703
+ title="LoadConfig",
704
+ description=LoadConfig.__doc__,
705
+ )
706
+ load_group.add_argument("--load-format", **load_kwargs["load_format"])
707
+ load_group.add_argument("--download-dir", **load_kwargs["download_dir"])
708
+ load_group.add_argument(
709
+ "--safetensors-load-strategy", **load_kwargs["safetensors_load_strategy"]
710
+ )
711
+ load_group.add_argument(
712
+ "--model-loader-extra-config", **load_kwargs["model_loader_extra_config"]
713
+ )
714
+ load_group.add_argument("--ignore-patterns", **load_kwargs["ignore_patterns"])
715
+ load_group.add_argument("--use-tqdm-on-load", **load_kwargs["use_tqdm_on_load"])
716
+ load_group.add_argument(
717
+ "--pt-load-map-location", **load_kwargs["pt_load_map_location"]
718
+ )
719
+
720
+ # Attention arguments
721
+ attention_kwargs = get_kwargs(AttentionConfig)
722
+ attention_group = parser.add_argument_group(
723
+ title="AttentionConfig",
724
+ description=AttentionConfig.__doc__,
725
+ )
726
+ attention_group.add_argument(
727
+ "--attention-backend", **attention_kwargs["backend"]
728
+ )
729
+
730
+ # Structured outputs arguments
731
+ structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig)
732
+ structured_outputs_group = parser.add_argument_group(
733
+ title="StructuredOutputsConfig",
734
+ description=StructuredOutputsConfig.__doc__,
735
+ )
736
+ structured_outputs_group.add_argument(
737
+ "--reasoning-parser",
738
+ # Choices need to be validated after parsing to include plugins
739
+ **structured_outputs_kwargs["reasoning_parser"],
740
+ )
741
+ structured_outputs_group.add_argument(
742
+ "--reasoning-parser-plugin",
743
+ **structured_outputs_kwargs["reasoning_parser_plugin"],
744
+ )
745
+
746
+ # Parallel arguments
747
+ parallel_kwargs = get_kwargs(ParallelConfig)
748
+ parallel_group = parser.add_argument_group(
749
+ title="ParallelConfig",
750
+ description=ParallelConfig.__doc__,
751
+ )
752
+ parallel_group.add_argument(
753
+ "--distributed-executor-backend",
754
+ **parallel_kwargs["distributed_executor_backend"],
755
+ )
756
+ parallel_group.add_argument(
757
+ "--pipeline-parallel-size",
758
+ "-pp",
759
+ **parallel_kwargs["pipeline_parallel_size"],
760
+ )
761
+ parallel_group.add_argument("--master-addr", **parallel_kwargs["master_addr"])
762
+ parallel_group.add_argument("--master-port", **parallel_kwargs["master_port"])
763
+ parallel_group.add_argument("--nnodes", "-n", **parallel_kwargs["nnodes"])
764
+ parallel_group.add_argument("--node-rank", "-r", **parallel_kwargs["node_rank"])
765
+ parallel_group.add_argument(
766
+ "--tensor-parallel-size", "-tp", **parallel_kwargs["tensor_parallel_size"]
767
+ )
768
+ parallel_group.add_argument(
769
+ "--decode-context-parallel-size",
770
+ "-dcp",
771
+ **parallel_kwargs["decode_context_parallel_size"],
772
+ )
773
+ parallel_group.add_argument(
774
+ "--dcp-kv-cache-interleave-size",
775
+ **parallel_kwargs["dcp_kv_cache_interleave_size"],
776
+ )
777
+ parallel_group.add_argument(
778
+ "--cp-kv-cache-interleave-size",
779
+ **parallel_kwargs["cp_kv_cache_interleave_size"],
780
+ )
781
+ parallel_group.add_argument(
782
+ "--prefill-context-parallel-size",
783
+ "-pcp",
784
+ **parallel_kwargs["prefill_context_parallel_size"],
785
+ )
786
+ parallel_group.add_argument(
787
+ "--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"]
788
+ )
789
+ parallel_group.add_argument(
790
+ "--data-parallel-rank",
791
+ "-dpn",
792
+ type=int,
793
+ help="Data parallel rank of this instance. "
794
+ "When set, enables external load balancer mode.",
795
+ )
796
+ parallel_group.add_argument(
797
+ "--data-parallel-start-rank",
798
+ "-dpr",
799
+ type=int,
800
+ help="Starting data parallel rank for secondary nodes.",
801
+ )
802
+ parallel_group.add_argument(
803
+ "--data-parallel-size-local",
804
+ "-dpl",
805
+ type=int,
806
+ help="Number of data parallel replicas to run on this node.",
807
+ )
808
+ parallel_group.add_argument(
809
+ "--data-parallel-address",
810
+ "-dpa",
811
+ type=str,
812
+ help="Address of data parallel cluster head-node.",
813
+ )
814
+ parallel_group.add_argument(
815
+ "--data-parallel-rpc-port",
816
+ "-dpp",
817
+ type=int,
818
+ help="Port for data parallel RPC communication.",
819
+ )
820
+ parallel_group.add_argument(
821
+ "--data-parallel-backend",
822
+ "-dpb",
823
+ type=str,
824
+ default="mp",
825
+ help='Backend for data parallel, either "mp" or "ray".',
826
+ )
827
+ parallel_group.add_argument(
828
+ "--data-parallel-hybrid-lb",
829
+ "-dph",
830
+ **parallel_kwargs["data_parallel_hybrid_lb"],
831
+ )
832
+ parallel_group.add_argument(
833
+ "--data-parallel-external-lb",
834
+ "-dpe",
835
+ **parallel_kwargs["data_parallel_external_lb"],
836
+ )
837
+ parallel_group.add_argument(
838
+ "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"]
839
+ )
840
+ parallel_group.add_argument(
841
+ "--all2all-backend", **parallel_kwargs["all2all_backend"]
842
+ )
843
+ parallel_group.add_argument("--enable-dbo", **parallel_kwargs["enable_dbo"])
844
+ parallel_group.add_argument(
845
+ "--dbo-decode-token-threshold",
846
+ **parallel_kwargs["dbo_decode_token_threshold"],
847
+ )
848
+ parallel_group.add_argument(
849
+ "--dbo-prefill-token-threshold",
850
+ **parallel_kwargs["dbo_prefill_token_threshold"],
851
+ )
852
+ parallel_group.add_argument(
853
+ "--disable-nccl-for-dp-synchronization",
854
+ **parallel_kwargs["disable_nccl_for_dp_synchronization"],
855
+ )
856
+ parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"])
857
+ parallel_group.add_argument("--eplb-config", **parallel_kwargs["eplb_config"])
858
+ parallel_group.add_argument(
859
+ "--expert-placement-strategy",
860
+ **parallel_kwargs["expert_placement_strategy"],
861
+ )
862
+
863
+ parallel_group.add_argument(
864
+ "--max-parallel-loading-workers",
865
+ **parallel_kwargs["max_parallel_loading_workers"],
866
+ )
867
+ parallel_group.add_argument(
868
+ "--ray-workers-use-nsight", **parallel_kwargs["ray_workers_use_nsight"]
869
+ )
870
+ parallel_group.add_argument(
871
+ "--disable-custom-all-reduce",
872
+ **parallel_kwargs["disable_custom_all_reduce"],
873
+ )
874
+ parallel_group.add_argument("--worker-cls", **parallel_kwargs["worker_cls"])
875
+ parallel_group.add_argument(
876
+ "--worker-extension-cls", **parallel_kwargs["worker_extension_cls"]
877
+ )
878
+
879
+ # KV cache arguments
880
+ cache_kwargs = get_kwargs(CacheConfig)
881
+ cache_group = parser.add_argument_group(
882
+ title="CacheConfig",
883
+ description=CacheConfig.__doc__,
884
+ )
885
+ cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
886
+ cache_group.add_argument(
887
+ "--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"]
888
+ )
889
+ cache_group.add_argument(
890
+ "--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
891
+ )
892
+ cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
893
+ cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"])
894
+ cache_group.add_argument(
895
+ "--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
896
+ )
897
+ cache_group.add_argument(
898
+ "--enable-prefix-caching",
899
+ **{
900
+ **cache_kwargs["enable_prefix_caching"],
901
+ "default": None,
902
+ },
903
+ )
904
+ cache_group.add_argument(
905
+ "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
906
+ )
907
+ cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"])
908
+ cache_group.add_argument(
909
+ "--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"]
910
+ )
911
+ cache_group.add_argument(
912
+ "--kv-sharing-fast-prefill", **cache_kwargs["kv_sharing_fast_prefill"]
913
+ )
914
+ cache_group.add_argument(
915
+ "--mamba-cache-dtype", **cache_kwargs["mamba_cache_dtype"]
916
+ )
917
+ cache_group.add_argument(
918
+ "--mamba-ssm-cache-dtype", **cache_kwargs["mamba_ssm_cache_dtype"]
919
+ )
920
+ cache_group.add_argument(
921
+ "--mamba-block-size", **cache_kwargs["mamba_block_size"]
922
+ )
923
+ cache_group.add_argument(
924
+ "--kv-offloading-size", **cache_kwargs["kv_offloading_size"]
925
+ )
926
+ cache_group.add_argument(
927
+ "--kv-offloading-backend", **cache_kwargs["kv_offloading_backend"]
928
+ )
929
+
930
+ # Multimodal related configs
931
+ multimodal_kwargs = get_kwargs(MultiModalConfig)
932
+ multimodal_group = parser.add_argument_group(
933
+ title="MultiModalConfig",
934
+ description=MultiModalConfig.__doc__,
935
+ )
936
+ multimodal_group.add_argument(
937
+ "--limit-mm-per-prompt", **multimodal_kwargs["limit_per_prompt"]
938
+ )
939
+ multimodal_group.add_argument(
940
+ "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
941
+ )
942
+ multimodal_group.add_argument(
943
+ "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"]
944
+ )
945
+ multimodal_group.add_argument(
946
+ "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
947
+ )
948
+ multimodal_group.add_argument(
949
+ "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"]
950
+ )
951
+ multimodal_group.add_argument(
952
+ "--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"]
953
+ )
954
+ multimodal_group.add_argument(
955
+ "--mm-shm-cache-max-object-size-mb",
956
+ **multimodal_kwargs["mm_shm_cache_max_object_size_mb"],
957
+ )
958
+ multimodal_group.add_argument(
959
+ "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"]
960
+ )
961
+ multimodal_group.add_argument(
962
+ "--mm-encoder-attn-backend",
963
+ **multimodal_kwargs["mm_encoder_attn_backend"],
964
+ )
965
+ multimodal_group.add_argument(
966
+ "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]
967
+ )
968
+ multimodal_group.add_argument(
969
+ "--skip-mm-profiling", **multimodal_kwargs["skip_mm_profiling"]
970
+ )
971
+
972
+ multimodal_group.add_argument(
973
+ "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"]
974
+ )
975
+
976
+ # LoRA related configs
977
+ lora_kwargs = get_kwargs(LoRAConfig)
978
+ lora_group = parser.add_argument_group(
979
+ title="LoRAConfig",
980
+ description=LoRAConfig.__doc__,
981
+ )
982
+ lora_group.add_argument(
983
+ "--enable-lora",
984
+ action=argparse.BooleanOptionalAction,
985
+ help="If True, enable handling of LoRA adapters.",
986
+ )
987
+ lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
988
+ lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"])
989
+ lora_group.add_argument(
990
+ "--lora-dtype",
991
+ **lora_kwargs["lora_dtype"],
992
+ )
993
+ lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"])
994
+ lora_group.add_argument(
995
+ "--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"]
996
+ )
997
+ lora_group.add_argument("--default-mm-loras", **lora_kwargs["default_mm_loras"])
998
+
999
+ # Observability arguments
1000
+ observability_kwargs = get_kwargs(ObservabilityConfig)
1001
+ observability_group = parser.add_argument_group(
1002
+ title="ObservabilityConfig",
1003
+ description=ObservabilityConfig.__doc__,
1004
+ )
1005
+ observability_group.add_argument(
1006
+ "--show-hidden-metrics-for-version",
1007
+ **observability_kwargs["show_hidden_metrics_for_version"],
1008
+ )
1009
+ observability_group.add_argument(
1010
+ "--otlp-traces-endpoint", **observability_kwargs["otlp_traces_endpoint"]
1011
+ )
1012
+ # TODO: generalise this special case
1013
+ choices = observability_kwargs["collect_detailed_traces"]["choices"]
1014
+ metavar = f"{{{','.join(choices)}}}"
1015
+ observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
1016
+ observability_kwargs["collect_detailed_traces"]["choices"] += [
1017
+ ",".join(p) for p in permutations(get_args(DetailedTraceModules), r=2)
1018
+ ]
1019
+ observability_group.add_argument(
1020
+ "--collect-detailed-traces",
1021
+ **observability_kwargs["collect_detailed_traces"],
1022
+ )
1023
+ observability_group.add_argument(
1024
+ "--kv-cache-metrics", **observability_kwargs["kv_cache_metrics"]
1025
+ )
1026
+ observability_group.add_argument(
1027
+ "--kv-cache-metrics-sample",
1028
+ **observability_kwargs["kv_cache_metrics_sample"],
1029
+ )
1030
+ observability_group.add_argument(
1031
+ "--cudagraph-metrics",
1032
+ **observability_kwargs["cudagraph_metrics"],
1033
+ )
1034
+ observability_group.add_argument(
1035
+ "--enable-layerwise-nvtx-tracing",
1036
+ **observability_kwargs["enable_layerwise_nvtx_tracing"],
1037
+ )
1038
+
1039
+ # Scheduler arguments
1040
+ scheduler_kwargs = get_kwargs(SchedulerConfig)
1041
+ scheduler_group = parser.add_argument_group(
1042
+ title="SchedulerConfig",
1043
+ description=SchedulerConfig.__doc__,
1044
+ )
1045
+ scheduler_group.add_argument(
1046
+ "--max-num-batched-tokens",
1047
+ **{
1048
+ **scheduler_kwargs["max_num_batched_tokens"],
1049
+ "default": None,
1050
+ },
1051
+ )
1052
+ scheduler_group.add_argument(
1053
+ "--max-num-seqs",
1054
+ **{
1055
+ **scheduler_kwargs["max_num_seqs"],
1056
+ "default": None,
1057
+ },
1058
+ )
1059
+ scheduler_group.add_argument(
1060
+ "--max-num-partial-prefills", **scheduler_kwargs["max_num_partial_prefills"]
1061
+ )
1062
+ scheduler_group.add_argument(
1063
+ "--max-long-partial-prefills",
1064
+ **scheduler_kwargs["max_long_partial_prefills"],
1065
+ )
1066
+ scheduler_group.add_argument(
1067
+ "--long-prefill-token-threshold",
1068
+ **scheduler_kwargs["long_prefill_token_threshold"],
1069
+ )
1070
+ # multi-step scheduling has been removed; corresponding arguments
1071
+ # are no longer supported.
1072
+ scheduler_group.add_argument(
1073
+ "--scheduling-policy", **scheduler_kwargs["policy"]
1074
+ )
1075
+ scheduler_group.add_argument(
1076
+ "--enable-chunked-prefill",
1077
+ **{
1078
+ **scheduler_kwargs["enable_chunked_prefill"],
1079
+ "default": None,
1080
+ },
1081
+ )
1082
+ scheduler_group.add_argument(
1083
+ "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"]
1084
+ )
1085
+ scheduler_group.add_argument(
1086
+ "--scheduler-cls", **scheduler_kwargs["scheduler_cls"]
1087
+ )
1088
+ scheduler_group.add_argument(
1089
+ "--disable-hybrid-kv-cache-manager",
1090
+ **scheduler_kwargs["disable_hybrid_kv_cache_manager"],
1091
+ )
1092
+ scheduler_group.add_argument(
1093
+ "--async-scheduling", **scheduler_kwargs["async_scheduling"]
1094
+ )
1095
+ scheduler_group.add_argument(
1096
+ "--stream-interval", **scheduler_kwargs["stream_interval"]
1097
+ )
1098
+
1099
+ # Compilation arguments
1100
+ compilation_kwargs = get_kwargs(CompilationConfig)
1101
+ compilation_group = parser.add_argument_group(
1102
+ title="CompilationConfig",
1103
+ description=CompilationConfig.__doc__,
1104
+ )
1105
+ compilation_group.add_argument(
1106
+ "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"]
1107
+ )
1108
+ compilation_group.add_argument(
1109
+ "--max-cudagraph-capture-size",
1110
+ **compilation_kwargs["max_cudagraph_capture_size"],
1111
+ )
1112
+
1113
+ # vLLM arguments
1114
+ vllm_kwargs = get_kwargs(VllmConfig)
1115
+ vllm_group = parser.add_argument_group(
1116
+ title="VllmConfig",
1117
+ description=VllmConfig.__doc__,
1118
+ )
1119
+ # We construct SpeculativeConfig using fields from other configs in
1120
+ # create_engine_config. So we set the type to a JSON string here to
1121
+ # delay the Pydantic validation that comes with SpeculativeConfig.
1122
+ vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads)
1123
+ vllm_group.add_argument(
1124
+ "--speculative-config", **vllm_kwargs["speculative_config"]
1125
+ )
1126
+ vllm_group.add_argument(
1127
+ "--kv-transfer-config", **vllm_kwargs["kv_transfer_config"]
1128
+ )
1129
+ vllm_group.add_argument("--kv-events-config", **vllm_kwargs["kv_events_config"])
1130
+ vllm_group.add_argument(
1131
+ "--ec-transfer-config", **vllm_kwargs["ec_transfer_config"]
1132
+ )
1133
+ vllm_group.add_argument(
1134
+ "--compilation-config", "-cc", **vllm_kwargs["compilation_config"]
1135
+ )
1136
+ vllm_group.add_argument(
1137
+ "--attention-config", "-ac", **vllm_kwargs["attention_config"]
1138
+ )
1139
+ vllm_group.add_argument(
1140
+ "--additional-config", **vllm_kwargs["additional_config"]
1141
+ )
1142
+ vllm_group.add_argument(
1143
+ "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"]
1144
+ )
1145
+ vllm_group.add_argument("--profiler-config", **vllm_kwargs["profiler_config"])
1146
+ vllm_group.add_argument(
1147
+ "--optimization-level", **vllm_kwargs["optimization_level"]
1148
+ )
1149
+
1150
+ # Other arguments
1151
+ parser.add_argument(
1152
+ "--disable-log-stats",
1153
+ action="store_true",
1154
+ help="Disable logging statistics.",
1155
+ )
1156
+
1157
+ parser.add_argument(
1158
+ "--aggregate-engine-logging",
1159
+ action="store_true",
1160
+ help="Log aggregate rather than per-engine statistics "
1161
+ "when using data parallelism.",
1162
+ )
1163
+ return parser
1164
+
1165
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
    """Construct an instance of this dataclass from a parsed CLI namespace.

    Only namespace entries that correspond to fields declared on ``cls``
    are consumed; extra attributes on ``args`` are ignored, and any field
    missing from ``args`` falls back to its declared default.
    """
    init_kwargs = {}
    for field in dataclasses.fields(cls):
        # Guard with hasattr so a partially-populated namespace still works.
        if hasattr(args, field.name):
            init_kwargs[field.name] = getattr(args, field.name)
    return cls(**init_kwargs)
1174
+
1175
def create_model_config(self) -> ModelConfig:
    """Build a ``ModelConfig`` from this object's model-related fields.

    Side effects:
        - If the model path points at a GGUF file, forces both
          ``self.quantization`` and ``self.load_format`` to ``"gguf"``.
        - Emits a warning about global-seed effects when V1
          multiprocessing is disabled via the environment.
    """
    # gguf file needs a specific model loader
    if is_gguf(self.model):
        self.quantization = self.load_format = "gguf"

    # Without multiprocessing, seeding happens in this process, so the
    # launcher's own random state may be affected; warn the user.
    if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
        logger.warning(
            "The global random seed is set to %d. Since "
            "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
            "affect the random state of the Python process that "
            "launched vLLM.",
            self.seed,
        )

    # Straight field-for-field pass-through of the model-related CLI
    # arguments into the ModelConfig constructor.
    return ModelConfig(
        model=self.model,
        hf_config_path=self.hf_config_path,
        runner=self.runner,
        convert=self.convert,
        tokenizer=self.tokenizer,
        tokenizer_mode=self.tokenizer_mode,
        trust_remote_code=self.trust_remote_code,
        allowed_local_media_path=self.allowed_local_media_path,
        allowed_media_domains=self.allowed_media_domains,
        dtype=self.dtype,
        seed=self.seed,
        revision=self.revision,
        code_revision=self.code_revision,
        hf_token=self.hf_token,
        hf_overrides=self.hf_overrides,
        tokenizer_revision=self.tokenizer_revision,
        max_model_len=self.max_model_len,
        quantization=self.quantization,
        enforce_eager=self.enforce_eager,
        max_logprobs=self.max_logprobs,
        logprobs_mode=self.logprobs_mode,
        disable_sliding_window=self.disable_sliding_window,
        disable_cascade_attn=self.disable_cascade_attn,
        skip_tokenizer_init=self.skip_tokenizer_init,
        enable_prompt_embeds=self.enable_prompt_embeds,
        served_model_name=self.served_model_name,
        limit_mm_per_prompt=self.limit_mm_per_prompt,
        enable_mm_embeds=self.enable_mm_embeds,
        interleave_mm_strings=self.interleave_mm_strings,
        media_io_kwargs=self.media_io_kwargs,
        skip_mm_profiling=self.skip_mm_profiling,
        config_format=self.config_format,
        mm_processor_kwargs=self.mm_processor_kwargs,
        mm_processor_cache_gb=self.mm_processor_cache_gb,
        mm_processor_cache_type=self.mm_processor_cache_type,
        mm_shm_cache_max_object_size_mb=self.mm_shm_cache_max_object_size_mb,
        mm_encoder_tp_mode=self.mm_encoder_tp_mode,
        mm_encoder_attn_backend=self.mm_encoder_attn_backend,
        pooler_config=self.pooler_config,
        logits_processor_pattern=self.logits_processor_pattern,
        generation_config=self.generation_config,
        override_generation_config=self.override_generation_config,
        enable_sleep_mode=self.enable_sleep_mode,
        model_impl=self.model_impl,
        override_attention_dtype=self.override_attention_dtype,
        logits_processors=self.logits_processors,
        video_pruning_rate=self.video_pruning_rate,
        io_processor_plugin=self.io_processor_plugin,
    )
1239
+
1240
def validate_tensorizer_args(self):
    """Mirror top-level tensorizer options into the nested config dict.

    Any key of ``model_loader_extra_config`` that names a
    ``TensorizerConfig`` field is copied into
    ``model_loader_extra_config["tensorizer_config"]`` (the top-level
    entry is left in place).
    """
    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

    extra_config = self.model_loader_extra_config
    tensorizer_field_names = TensorizerConfig._fields
    for key, value in list(extra_config.items()):
        if key in tensorizer_field_names:
            extra_config["tensorizer_config"][key] = value
1248
+
1249
def create_load_config(self) -> LoadConfig:
    """Assemble the ``LoadConfig`` used for loading model weights.

    Side effects:
        - bitsandbytes quantization forces ``load_format`` to
          ``"bitsandbytes"``.
        - For the tensorizer load format, normalizes
          ``model_loader_extra_config`` to a serializable dict, resets its
          ``tensorizer_config`` entry to point at the model directory, and
          copies matching top-level tensorizer fields into it.
    """
    # bitsandbytes pre-quantized checkpoints need the matching loader.
    if self.quantization == "bitsandbytes":
        self.load_format = "bitsandbytes"

    if self.load_format == "tensorizer":
        extra_config = self.model_loader_extra_config
        if hasattr(extra_config, "to_serializable"):
            extra_config = extra_config.to_serializable()
            self.model_loader_extra_config = extra_config
        # Fresh nested config rooted at the model path; any matching
        # top-level keys are copied in by validate_tensorizer_args().
        extra_config["tensorizer_config"] = {"tensorizer_dir": self.model}
        self.validate_tensorizer_args()

    # Online quantization loads weights on CPU first.
    if is_online_quantization(self.quantization):
        load_device = "cpu"
    else:
        load_device = None

    return LoadConfig(
        load_format=self.load_format,
        download_dir=self.download_dir,
        safetensors_load_strategy=self.safetensors_load_strategy,
        device=load_device,
        model_loader_extra_config=self.model_loader_extra_config,
        ignore_patterns=self.ignore_patterns,
        use_tqdm_on_load=self.use_tqdm_on_load,
        pt_load_map_location=self.pt_load_map_location,
    )
1274
+
1275
def create_speculative_config(
    self,
    target_model_config: ModelConfig,
    target_parallel_config: ParallelConfig,
) -> SpeculativeConfig | None:
    """Create a ``SpeculativeConfig`` from ``self.speculative_config``.

    The speculative settings may arrive either as a JSON string parsed
    from the CLI (``--speculative-config``) or as a dictionary handed to
    the engine directly. Returns ``None`` when speculative decoding was
    not requested.
    """
    spec_kwargs = self.speculative_config
    if spec_kwargs is None:
        return None

    # Note(Shangming): These parameters are not obtained from the cli arg
    # '--speculative-config' and must be passed in when creating the engine
    # config.
    spec_kwargs["target_model_config"] = target_model_config
    spec_kwargs["target_parallel_config"] = target_parallel_config
    return SpeculativeConfig(**spec_kwargs)
1301
+
1302
+ def create_engine_config(
1303
+ self,
1304
+ usage_context: UsageContext | None = None,
1305
+ headless: bool = False,
1306
+ ) -> VllmConfig:
1307
+ """
1308
+ Create the VllmConfig.
1309
+
1310
+ NOTE: If VllmConfig is incompatible, we raise an error.
1311
+ """
1312
+ current_platform.pre_register_and_update()
1313
+
1314
+ device_config = DeviceConfig(device=cast(Device, current_platform.device_type))
1315
+
1316
+ # Check if the model is a speculator and override model/tokenizer/config
1317
+ # BEFORE creating ModelConfig, so the config is created with the target model
1318
+ # Skip speculator detection for cloud storage models (eg: S3, GCS) since
1319
+ # HuggingFace cannot load configs directly from S3 URLs. S3 models can still
1320
+ # use speculators with explicit --speculative-config.
1321
+ if not is_cloud_storage(self.model):
1322
+ (self.model, self.tokenizer, self.speculative_config) = (
1323
+ maybe_override_with_speculators(
1324
+ model=self.model,
1325
+ tokenizer=self.tokenizer,
1326
+ revision=self.revision,
1327
+ trust_remote_code=self.trust_remote_code,
1328
+ vllm_speculative_config=self.speculative_config,
1329
+ )
1330
+ )
1331
+
1332
+ model_config = self.create_model_config()
1333
+ self.model = model_config.model
1334
+ self.tokenizer = model_config.tokenizer
1335
+
1336
+ self._check_feature_supported(model_config)
1337
+ self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
1338
+ self._set_default_max_num_seqs_and_batched_tokens_args(
1339
+ usage_context, model_config
1340
+ )
1341
+
1342
+ sliding_window: int | None = None
1343
+ if not is_interleaved(model_config.hf_text_config):
1344
+ # Only set CacheConfig.sliding_window if the model is all sliding
1345
+ # window. Otherwise CacheConfig.sliding_window will override the
1346
+ # global layers in interleaved sliding window models.
1347
+ sliding_window = model_config.get_sliding_window()
1348
+
1349
+ # Note(hc): In the current implementation of decode context
1350
+ # parallel(DCP), tp_size needs to be divisible by dcp_size,
1351
+ # because the world size does not change by dcp, it simply
1352
+ # reuses the GPUs of TP group, and split one TP group into
1353
+ # tp_size//dcp_size DCP groups.
1354
+ assert self.tensor_parallel_size % self.decode_context_parallel_size == 0, (
1355
+ f"tp_size={self.tensor_parallel_size} must be divisible by"
1356
+ f"dcp_size={self.decode_context_parallel_size}."
1357
+ )
1358
+
1359
+ cache_config = CacheConfig(
1360
+ block_size=self.block_size,
1361
+ gpu_memory_utilization=self.gpu_memory_utilization,
1362
+ kv_cache_memory_bytes=self.kv_cache_memory_bytes,
1363
+ swap_space=self.swap_space,
1364
+ cache_dtype=self.kv_cache_dtype,
1365
+ is_attention_free=model_config.is_attention_free,
1366
+ num_gpu_blocks_override=self.num_gpu_blocks_override,
1367
+ sliding_window=sliding_window,
1368
+ enable_prefix_caching=self.enable_prefix_caching,
1369
+ prefix_caching_hash_algo=self.prefix_caching_hash_algo,
1370
+ cpu_offload_gb=self.cpu_offload_gb,
1371
+ calculate_kv_scales=self.calculate_kv_scales,
1372
+ kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
1373
+ mamba_cache_dtype=self.mamba_cache_dtype,
1374
+ mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
1375
+ mamba_block_size=self.mamba_block_size,
1376
+ kv_offloading_size=self.kv_offloading_size,
1377
+ kv_offloading_backend=self.kv_offloading_backend,
1378
+ )
1379
+
1380
+ ray_runtime_env = None
1381
+ if is_ray_initialized():
1382
+ # Ray Serve LLM calls `create_engine_config` in the context
1383
+ # of a Ray task, therefore we check is_ray_initialized()
1384
+ # as opposed to is_in_ray_actor().
1385
+ import ray
1386
+
1387
+ ray_runtime_env = ray.get_runtime_context().runtime_env
1388
+ # Avoid logging sensitive environment variables
1389
+ sanitized_env = ray_runtime_env.to_dict() if ray_runtime_env else {}
1390
+ if "env_vars" in sanitized_env:
1391
+ sanitized_env["env_vars"] = {
1392
+ k: "***" for k in sanitized_env["env_vars"]
1393
+ }
1394
+ logger.info("Using ray runtime env (env vars redacted): %s", sanitized_env)
1395
+
1396
+ # Get the current placement group if Ray is initialized and
1397
+ # we are in a Ray actor. If so, then the placement group will be
1398
+ # passed to spawned processes.
1399
+ placement_group = None
1400
+ if is_in_ray_actor():
1401
+ import ray
1402
+
1403
+ # This call initializes Ray automatically if it is not initialized,
1404
+ # but we should not do this here.
1405
+ placement_group = ray.util.get_current_placement_group()
1406
+
1407
+ assert not headless or not self.data_parallel_hybrid_lb, (
1408
+ "data_parallel_hybrid_lb is not applicable in headless mode"
1409
+ )
1410
+ assert not (self.data_parallel_hybrid_lb and self.data_parallel_external_lb), (
1411
+ "data_parallel_hybrid_lb and data_parallel_external_lb cannot both be True."
1412
+ )
1413
+ assert self.data_parallel_backend == "mp" or self.nnodes == 1, (
1414
+ "nnodes > 1 is only supported with data_parallel_backend=mp"
1415
+ )
1416
+ inferred_data_parallel_rank = 0
1417
+ if self.nnodes > 1:
1418
+ world_size = (
1419
+ self.data_parallel_size
1420
+ * self.pipeline_parallel_size
1421
+ * self.tensor_parallel_size
1422
+ )
1423
+ world_size_within_dp = (
1424
+ self.pipeline_parallel_size * self.tensor_parallel_size
1425
+ )
1426
+ local_world_size = world_size // self.nnodes
1427
+ assert world_size % self.nnodes == 0, (
1428
+ f"world_size={world_size} must be divisible by nnodes={self.nnodes}."
1429
+ )
1430
+ assert self.node_rank < self.nnodes, (
1431
+ f"node_rank={self.node_rank} must be less than nnodes={self.nnodes}."
1432
+ )
1433
+ inferred_data_parallel_rank = (
1434
+ self.node_rank * local_world_size
1435
+ ) // world_size_within_dp
1436
+ if self.data_parallel_size > 1 and self.data_parallel_external_lb:
1437
+ self.data_parallel_rank = inferred_data_parallel_rank
1438
+ logger.info(
1439
+ "Inferred data_parallel_rank %d from node_rank %d for external lb",
1440
+ self.data_parallel_rank,
1441
+ self.node_rank,
1442
+ )
1443
+ elif self.data_parallel_size_local is None:
1444
+ # Infer data parallel size local for internal dplb:
1445
+ self.data_parallel_size_local = max(
1446
+ local_world_size // world_size_within_dp, 1
1447
+ )
1448
+ data_parallel_external_lb = (
1449
+ self.data_parallel_external_lb or self.data_parallel_rank is not None
1450
+ )
1451
+ # Local DP rank = 1, use pure-external LB.
1452
+ if data_parallel_external_lb:
1453
+ assert self.data_parallel_rank is not None, (
1454
+ "data_parallel_rank or node_rank must be specified if "
1455
+ "data_parallel_external_lb is enable."
1456
+ )
1457
+ assert self.data_parallel_size_local in (1, None), (
1458
+ "data_parallel_size_local must be 1 or None when data_parallel_rank "
1459
+ "is set"
1460
+ )
1461
+ data_parallel_size_local = 1
1462
+ # Use full external lb if we have local_size of 1.
1463
+ self.data_parallel_hybrid_lb = False
1464
+ elif self.data_parallel_size_local is not None:
1465
+ data_parallel_size_local = self.data_parallel_size_local
1466
+
1467
+ if self.data_parallel_start_rank and not headless:
1468
+ # Infer hybrid LB mode.
1469
+ self.data_parallel_hybrid_lb = True
1470
+
1471
+ if self.data_parallel_hybrid_lb and data_parallel_size_local == 1:
1472
+ # Use full external lb if we have local_size of 1.
1473
+ logger.warning(
1474
+ "data_parallel_hybrid_lb is not eligible when "
1475
+ "data_parallel_size_local = 1, autoswitch to "
1476
+ "data_parallel_external_lb."
1477
+ )
1478
+ data_parallel_external_lb = True
1479
+ self.data_parallel_hybrid_lb = False
1480
+
1481
+ if data_parallel_size_local == self.data_parallel_size:
1482
+ # Disable hybrid LB mode if set for a single node
1483
+ self.data_parallel_hybrid_lb = False
1484
+
1485
+ self.data_parallel_rank = (
1486
+ self.data_parallel_start_rank or inferred_data_parallel_rank
1487
+ )
1488
+ if self.nnodes > 1:
1489
+ logger.info(
1490
+ "Inferred data_parallel_rank %d from node_rank %d",
1491
+ self.data_parallel_rank,
1492
+ self.node_rank,
1493
+ )
1494
+ else:
1495
+ assert not self.data_parallel_hybrid_lb, (
1496
+ "data_parallel_size_local must be set to use data_parallel_hybrid_lb."
1497
+ )
1498
+
1499
+ if self.data_parallel_backend == "ray" and (
1500
+ envs.VLLM_RAY_DP_PACK_STRATEGY == "span"
1501
+ ):
1502
+ # Data parallel size defaults to 1 if DP ranks are spanning
1503
+ # multiple nodes
1504
+ data_parallel_size_local = 1
1505
+ else:
1506
+ # Otherwise local DP size defaults to global DP size if not set
1507
+ data_parallel_size_local = self.data_parallel_size
1508
+
1509
+ # DP address, used in multi-node case for torch distributed group
1510
+ # and ZMQ sockets.
1511
+ if self.data_parallel_address is None:
1512
+ if self.data_parallel_backend == "ray":
1513
+ host_ip = get_ip()
1514
+ logger.info(
1515
+ "Using host IP %s as ray-based data parallel address", host_ip
1516
+ )
1517
+ data_parallel_address = host_ip
1518
+ else:
1519
+ assert self.data_parallel_backend == "mp", (
1520
+ "data_parallel_backend can only be ray or mp, got %s",
1521
+ self.data_parallel_backend,
1522
+ )
1523
+ data_parallel_address = (
1524
+ self.master_addr or ParallelConfig.data_parallel_master_ip
1525
+ )
1526
+ else:
1527
+ data_parallel_address = self.data_parallel_address
1528
+
1529
+ # This port is only used when there are remote data parallel engines,
1530
+ # otherwise the local IPC transport is used.
1531
+ data_parallel_rpc_port = (
1532
+ self.data_parallel_rpc_port
1533
+ if (self.data_parallel_rpc_port is not None)
1534
+ else ParallelConfig.data_parallel_rpc_port
1535
+ )
1536
+
1537
+ if self.tokens_only and not model_config.skip_tokenizer_init:
1538
+ model_config.skip_tokenizer_init = True
1539
+ logger.info("Skipping tokenizer initialization for tokens-only mode.")
1540
+
1541
+ parallel_config = ParallelConfig(
1542
+ pipeline_parallel_size=self.pipeline_parallel_size,
1543
+ tensor_parallel_size=self.tensor_parallel_size,
1544
+ prefill_context_parallel_size=self.prefill_context_parallel_size,
1545
+ data_parallel_size=self.data_parallel_size,
1546
+ data_parallel_rank=self.data_parallel_rank or 0,
1547
+ data_parallel_external_lb=data_parallel_external_lb,
1548
+ data_parallel_size_local=data_parallel_size_local,
1549
+ master_addr=self.master_addr,
1550
+ master_port=self.master_port,
1551
+ nnodes=self.nnodes,
1552
+ node_rank=self.node_rank,
1553
+ data_parallel_master_ip=data_parallel_address,
1554
+ data_parallel_rpc_port=data_parallel_rpc_port,
1555
+ data_parallel_backend=self.data_parallel_backend,
1556
+ data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
1557
+ enable_expert_parallel=self.enable_expert_parallel,
1558
+ all2all_backend=self.all2all_backend,
1559
+ enable_dbo=self.enable_dbo,
1560
+ dbo_decode_token_threshold=self.dbo_decode_token_threshold,
1561
+ dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
1562
+ disable_nccl_for_dp_synchronization=self.disable_nccl_for_dp_synchronization,
1563
+ enable_eplb=self.enable_eplb,
1564
+ eplb_config=self.eplb_config,
1565
+ expert_placement_strategy=self.expert_placement_strategy,
1566
+ max_parallel_loading_workers=self.max_parallel_loading_workers,
1567
+ disable_custom_all_reduce=self.disable_custom_all_reduce,
1568
+ ray_workers_use_nsight=self.ray_workers_use_nsight,
1569
+ ray_runtime_env=ray_runtime_env,
1570
+ placement_group=placement_group,
1571
+ distributed_executor_backend=self.distributed_executor_backend,
1572
+ worker_cls=self.worker_cls,
1573
+ worker_extension_cls=self.worker_extension_cls,
1574
+ decode_context_parallel_size=self.decode_context_parallel_size,
1575
+ dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size,
1576
+ cp_kv_cache_interleave_size=self.cp_kv_cache_interleave_size,
1577
+ _api_process_count=self._api_process_count,
1578
+ _api_process_rank=self._api_process_rank,
1579
+ )
1580
+
1581
+ speculative_config = self.create_speculative_config(
1582
+ target_model_config=model_config,
1583
+ target_parallel_config=parallel_config,
1584
+ )
1585
+
1586
+ scheduler_config = SchedulerConfig(
1587
+ runner_type=model_config.runner_type,
1588
+ max_num_batched_tokens=self.max_num_batched_tokens,
1589
+ max_num_seqs=self.max_num_seqs,
1590
+ max_model_len=model_config.max_model_len,
1591
+ enable_chunked_prefill=self.enable_chunked_prefill,
1592
+ disable_chunked_mm_input=self.disable_chunked_mm_input,
1593
+ is_multimodal_model=model_config.is_multimodal_model,
1594
+ is_encoder_decoder=model_config.is_encoder_decoder,
1595
+ policy=self.scheduling_policy,
1596
+ scheduler_cls=self.scheduler_cls,
1597
+ max_num_partial_prefills=self.max_num_partial_prefills,
1598
+ max_long_partial_prefills=self.max_long_partial_prefills,
1599
+ long_prefill_token_threshold=self.long_prefill_token_threshold,
1600
+ disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager,
1601
+ async_scheduling=self.async_scheduling,
1602
+ stream_interval=self.stream_interval,
1603
+ )
1604
+
1605
+ if not model_config.is_multimodal_model and self.default_mm_loras:
1606
+ raise ValueError(
1607
+ "Default modality-specific LoRA(s) were provided for a "
1608
+ "non multimodal model"
1609
+ )
1610
+
1611
+ lora_config = (
1612
+ LoRAConfig(
1613
+ max_lora_rank=self.max_lora_rank,
1614
+ max_loras=self.max_loras,
1615
+ default_mm_loras=self.default_mm_loras,
1616
+ fully_sharded_loras=self.fully_sharded_loras,
1617
+ lora_dtype=self.lora_dtype,
1618
+ max_cpu_loras=self.max_cpu_loras
1619
+ if self.max_cpu_loras and self.max_cpu_loras > 0
1620
+ else None,
1621
+ )
1622
+ if self.enable_lora
1623
+ else None
1624
+ )
1625
+
1626
+ if (
1627
+ lora_config is not None
1628
+ and speculative_config is not None
1629
+ and scheduler_config.max_num_batched_tokens
1630
+ < (
1631
+ scheduler_config.max_num_seqs
1632
+ * (speculative_config.num_speculative_tokens + 1)
1633
+ )
1634
+ ):
1635
+ raise ValueError(
1636
+ "Consider increasing max_num_batched_tokens or "
1637
+ "decreasing num_speculative_tokens"
1638
+ )
1639
+
1640
+ # bitsandbytes pre-quantized model need a specific model loader
1641
+ if model_config.quantization == "bitsandbytes":
1642
+ self.quantization = self.load_format = "bitsandbytes"
1643
+
1644
+ # Attention config overrides
1645
+ attention_config = copy.deepcopy(self.attention_config)
1646
+ if self.attention_backend is not None:
1647
+ if attention_config.backend is not None:
1648
+ raise ValueError(
1649
+ "attention_backend and attention_config.backend "
1650
+ "are mutually exclusive"
1651
+ )
1652
+ # Convert string to enum if needed (CLI parsing returns a string)
1653
+ if isinstance(self.attention_backend, str):
1654
+ attention_config.backend = AttentionBackendEnum[
1655
+ self.attention_backend.upper()
1656
+ ]
1657
+ else:
1658
+ attention_config.backend = self.attention_backend
1659
+
1660
+ load_config = self.create_load_config()
1661
+
1662
+ # Pass reasoning_parser into StructuredOutputsConfig
1663
+ if self.reasoning_parser:
1664
+ self.structured_outputs_config.reasoning_parser = self.reasoning_parser
1665
+
1666
+ if self.reasoning_parser_plugin:
1667
+ self.structured_outputs_config.reasoning_parser_plugin = (
1668
+ self.reasoning_parser_plugin
1669
+ )
1670
+
1671
+ observability_config = ObservabilityConfig(
1672
+ show_hidden_metrics_for_version=self.show_hidden_metrics_for_version,
1673
+ otlp_traces_endpoint=self.otlp_traces_endpoint,
1674
+ collect_detailed_traces=self.collect_detailed_traces,
1675
+ kv_cache_metrics=self.kv_cache_metrics,
1676
+ kv_cache_metrics_sample=self.kv_cache_metrics_sample,
1677
+ cudagraph_metrics=self.cudagraph_metrics,
1678
+ enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
1679
+ )
1680
+
1681
+ # Compilation config overrides
1682
+ compilation_config = copy.deepcopy(self.compilation_config)
1683
+ if self.cudagraph_capture_sizes is not None:
1684
+ if compilation_config.cudagraph_capture_sizes is not None:
1685
+ raise ValueError(
1686
+ "cudagraph_capture_sizes and compilation_config."
1687
+ "cudagraph_capture_sizes are mutually exclusive"
1688
+ )
1689
+ compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes
1690
+ if self.max_cudagraph_capture_size is not None:
1691
+ if compilation_config.max_cudagraph_capture_size is not None:
1692
+ raise ValueError(
1693
+ "max_cudagraph_capture_size and compilation_config."
1694
+ "max_cudagraph_capture_size are mutually exclusive"
1695
+ )
1696
+ compilation_config.max_cudagraph_capture_size = (
1697
+ self.max_cudagraph_capture_size
1698
+ )
1699
+ config = VllmConfig(
1700
+ model_config=model_config,
1701
+ cache_config=cache_config,
1702
+ parallel_config=parallel_config,
1703
+ scheduler_config=scheduler_config,
1704
+ device_config=device_config,
1705
+ load_config=load_config,
1706
+ attention_config=attention_config,
1707
+ lora_config=lora_config,
1708
+ speculative_config=speculative_config,
1709
+ structured_outputs_config=self.structured_outputs_config,
1710
+ observability_config=observability_config,
1711
+ compilation_config=compilation_config,
1712
+ kv_transfer_config=self.kv_transfer_config,
1713
+ kv_events_config=self.kv_events_config,
1714
+ ec_transfer_config=self.ec_transfer_config,
1715
+ profiler_config=self.profiler_config,
1716
+ additional_config=self.additional_config,
1717
+ optimization_level=self.optimization_level,
1718
+ )
1719
+
1720
+ return config
1721
+
1722
+ def _check_feature_supported(self, model_config: ModelConfig):
1723
+ """Raise an error if the feature is not supported."""
1724
+ if self.logits_processor_pattern != EngineArgs.logits_processor_pattern:
1725
+ _raise_unsupported_error(feature_name="--logits-processor-pattern")
1726
+
1727
+ # No Concurrent Partial Prefills so far.
1728
+ if (
1729
+ self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills
1730
+ or self.max_long_partial_prefills
1731
+ != SchedulerConfig.max_long_partial_prefills
1732
+ ):
1733
+ _raise_unsupported_error(feature_name="Concurrent Partial Prefill")
1734
+
1735
+ # N-gram, Medusa, and Eagle are supported for speculative decoding.
1736
+ if self.speculative_config is not None:
1737
+ # speculative_config could still be a dict at this point
1738
+ if isinstance(self.speculative_config, dict):
1739
+ method = self.speculative_config.get("method", None)
1740
+ else:
1741
+ method = self.speculative_config.method
1742
+
1743
+ if method == "draft_model":
1744
+ raise NotImplementedError(
1745
+ "Draft model speculative decoding is not supported yet. "
1746
+ "Please consider using other speculative decoding methods "
1747
+ "such as ngram, medusa, eagle, or mtp."
1748
+ )
1749
+
1750
+ if self.pipeline_parallel_size > 1:
1751
+ supports_pp = getattr(
1752
+ self.distributed_executor_backend, "supports_pp", False
1753
+ )
1754
+ if not supports_pp and self.distributed_executor_backend not in (
1755
+ ParallelConfig.distributed_executor_backend,
1756
+ "ray",
1757
+ "mp",
1758
+ "external_launcher",
1759
+ ):
1760
+ name = (
1761
+ "Pipeline Parallelism without Ray distributed "
1762
+ "executor or multiprocessing executor or external "
1763
+ "launcher"
1764
+ )
1765
+ _raise_unsupported_error(feature_name=name)
1766
+
1767
    @classmethod
    def get_batch_defaults(
        cls,
        world_size: int,
    ) -> tuple[dict[UsageContext | None, int], dict[UsageContext | None, int]]:
        """Compute hardware-aware default values for batching limits.

        Returns a pair of dicts keyed by ``UsageContext`` (or ``None``):
        the default ``max_num_batched_tokens`` and the default
        ``max_num_seqs`` for the current platform. ``world_size`` only
        scales the CPU-specific defaults.
        """
        from vllm.usage.usage_lib import UsageContext

        default_max_num_batched_tokens: dict[UsageContext | None, int]
        default_max_num_seqs: dict[UsageContext | None, int]

        # When no user override, set the default values based on the usage
        # context.
        # Use different default values for different hardware.

        # Try to query the device name on the current platform. If it fails,
        # it may be because the platform that imports vLLM is not the same
        # as the platform that vLLM is running on (e.g. the case of scaling
        # vLLM with Ray) and has no GPUs. In this case we use the default
        # values for non-H100/H200 GPUs.
        try:
            device_memory = current_platform.get_device_total_memory()
            device_name = current_platform.get_device_name().lower()
        except Exception:
            # This is only used to set default_max_num_batched_tokens
            device_memory = 0
            device_name = ""

        # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
        # throughput, see PR #17885 for more details.
        # So here we do an extra device name check to prevent such regression.
        if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
            # For GPUs like H100 and MI300x, use larger default values.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 16384,
                UsageContext.OPENAI_API_SERVER: 8192,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 1024,
                UsageContext.OPENAI_API_SERVER: 1024,
            }
        else:
            # TODO(woosuk): Tune the default values for other hardware.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 8192,
                UsageContext.OPENAI_API_SERVER: 2048,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 256,
                UsageContext.OPENAI_API_SERVER: 256,
            }

        # tpu specific default values.
        # Only max_num_batched_tokens is overridden per TPU chip generation;
        # max_num_seqs keeps the GPU-path default chosen above.
        if current_platform.is_tpu():
            chip_name = current_platform.get_device_name()

            if chip_name == "V6E":
                default_max_num_batched_tokens = {
                    UsageContext.LLM_CLASS: 2048,
                    UsageContext.OPENAI_API_SERVER: 1024,
                }
            elif chip_name == "V5E":
                default_max_num_batched_tokens = {
                    UsageContext.LLM_CLASS: 1024,
                    UsageContext.OPENAI_API_SERVER: 512,
                }
            elif chip_name == "V5P":
                default_max_num_batched_tokens = {
                    UsageContext.LLM_CLASS: 512,
                    UsageContext.OPENAI_API_SERVER: 256,
                }

        # cpu specific default values.
        # CPU defaults scale linearly with world_size (one budget per worker).
        if current_platform.is_cpu():
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 4096 * world_size,
                UsageContext.OPENAI_API_SERVER: 2048 * world_size,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 256 * world_size,
                UsageContext.OPENAI_API_SERVER: 128 * world_size,
            }

        return default_max_num_batched_tokens, default_max_num_seqs
1850
+
1851
+ def _set_default_chunked_prefill_and_prefix_caching_args(
1852
+ self, model_config: ModelConfig
1853
+ ) -> None:
1854
+ default_chunked_prefill = model_config.is_chunked_prefill_supported
1855
+ default_prefix_caching = model_config.is_prefix_caching_supported
1856
+
1857
+ if self.enable_chunked_prefill is None:
1858
+ self.enable_chunked_prefill = default_chunked_prefill
1859
+
1860
+ logger.debug(
1861
+ "%s chunked prefill by default",
1862
+ "Enabling" if default_chunked_prefill else "Disabling",
1863
+ )
1864
+ elif (
1865
+ model_config.runner_type == "generate"
1866
+ and not self.enable_chunked_prefill
1867
+ and default_chunked_prefill
1868
+ ):
1869
+ logger.warning_once(
1870
+ "This model does not officially support disabling chunked prefill. "
1871
+ "Disabling this manually may cause the engine to crash "
1872
+ "or produce incorrect outputs.",
1873
+ scope="local",
1874
+ )
1875
+ elif (
1876
+ model_config.runner_type == "pooling"
1877
+ and self.enable_chunked_prefill
1878
+ and not default_chunked_prefill
1879
+ ):
1880
+ logger.warning_once(
1881
+ "This model does not officially support chunked prefill. "
1882
+ "Enabling this manually may cause the engine to crash "
1883
+ "or produce incorrect outputs.",
1884
+ scope="local",
1885
+ )
1886
+
1887
+ if self.enable_prefix_caching is None:
1888
+ self.enable_prefix_caching = default_prefix_caching
1889
+
1890
+ logger.debug(
1891
+ "%s prefix caching by default",
1892
+ "Enabling" if default_prefix_caching else "Disabling",
1893
+ )
1894
+ elif (
1895
+ model_config.runner_type == "pooling"
1896
+ and self.enable_prefix_caching
1897
+ and not default_prefix_caching
1898
+ ):
1899
+ logger.warning_once(
1900
+ "This model does not officially support prefix caching. "
1901
+ "Enabling this manually may cause the engine to crash "
1902
+ "or produce incorrect outputs.",
1903
+ scope="local",
1904
+ )
1905
+
1906
+ # Disable chunked prefill and prefix caching for:
1907
+ # POWER (ppc64le)/s390x/RISCV CPUs in V1
1908
+ if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
1909
+ CpuArchEnum.POWERPC,
1910
+ CpuArchEnum.S390X,
1911
+ CpuArchEnum.RISCV,
1912
+ ):
1913
+ logger.info(
1914
+ "Chunked prefill is not supported for ARM and POWER, "
1915
+ "S390X and RISC-V CPUs; "
1916
+ "disabling it for V1 backend."
1917
+ )
1918
+ self.enable_chunked_prefill = False
1919
+ logger.info(
1920
+ "Prefix caching is not supported for ARM and POWER, "
1921
+ "S390X and RISC-V CPUs; "
1922
+ "disabling it for V1 backend."
1923
+ )
1924
+ self.enable_prefix_caching = False
1925
+
1926
    def _set_default_max_num_seqs_and_batched_tokens_args(
        self,
        usage_context: UsageContext | None,
        model_config: ModelConfig,
    ):
        """Fill in max_num_batched_tokens and max_num_seqs when unset.

        Defaults come from get_batch_defaults() for the current platform and
        usage context, then are clamped against the model's max_model_len.
        Values the user supplied explicitly are never adjusted.
        """
        world_size = self.pipeline_parallel_size * self.tensor_parallel_size
        (
            default_max_num_batched_tokens,
            default_max_num_seqs,
        ) = self.get_batch_defaults(world_size)

        # Snapshot which values were user-provided (non-None) before
        # defaulting, so the clamping below only touches defaulted values.
        orig_max_num_batched_tokens = self.max_num_batched_tokens
        orig_max_num_seqs = self.max_num_seqs

        if self.max_num_batched_tokens is None:
            self.max_num_batched_tokens = default_max_num_batched_tokens.get(
                usage_context,
                SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )

        if self.max_num_seqs is None:
            self.max_num_seqs = default_max_num_seqs.get(
                usage_context,
                SchedulerConfig.DEFAULT_MAX_NUM_SEQS,
            )

        if orig_max_num_batched_tokens is None:
            if not self.enable_chunked_prefill:
                # If max_model_len is too short, use the default for higher throughput.
                # Without chunked prefill a full prompt must fit in one batch,
                # so the budget must cover at least max_model_len tokens.
                self.max_num_batched_tokens = max(
                    model_config.max_model_len,
                    self.max_num_batched_tokens,
                )

            # When using default settings,
            # Ensure max_num_batched_tokens does not exceed model limit.
            # Some models (e.g., Whisper) have embeddings tied to max length.
            self.max_num_batched_tokens = min(
                self.max_num_seqs * model_config.max_model_len,
                self.max_num_batched_tokens,
            )

            logger.debug(
                "Defaulting max_num_batched_tokens to %d for %s usage context.",
                self.max_num_batched_tokens,
                usage_context.value if usage_context else None,
            )

        if orig_max_num_seqs is None:
            assert self.max_num_batched_tokens is not None  # For type checking
            # A defaulted max_num_seqs never exceeds the per-step token budget.
            self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens)

            logger.debug(
                "Defaulting max_num_seqs to %d for %s usage context.",
                self.max_num_seqs,
                usage_context.value if usage_context else None,
            )
1983
+
1984
+
1985
@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""

    # Whether to log individual requests; exposed on the CLI as
    # --enable-log-requests / --no-enable-log-requests.
    enable_log_requests: bool = False

    @staticmethod
    def add_cli_args(
        parser: FlexibleArgumentParser, async_args_only: bool = False
    ) -> FlexibleArgumentParser:
        """Register async-engine CLI flags on `parser` and return it.

        When `async_args_only` is False, all base EngineArgs flags are
        registered first; the async-specific flags are appended either way.
        """
        # Initialize plugin to update the parser, for example, The plugin may
        # add a new kind of quantization method to --quantization argument or
        # a new device to --device argument.
        load_general_plugins()
        if not async_args_only:
            parser = EngineArgs.add_cli_args(parser)
        parser.add_argument(
            "--enable-log-requests",
            action=argparse.BooleanOptionalAction,
            default=AsyncEngineArgs.enable_log_requests,
            help="Enable logging requests.",
        )
        # Kept for backwards compatibility; prefer --enable-log-requests.
        # NOTE(review): `deprecated=` is native argparse only on Python 3.13+;
        # presumably FlexibleArgumentParser supports it earlier — confirm.
        parser.add_argument(
            "--disable-log-requests",
            action=argparse.BooleanOptionalAction,
            default=not AsyncEngineArgs.enable_log_requests,
            help="[DEPRECATED] Disable logging requests.",
            deprecated=True,
        )
        # Give the current platform a chance to add/adjust its own args.
        current_platform.pre_register_and_update(parser)
        return parser
2016
+
2017
+
2018
+ def _raise_unsupported_error(feature_name: str):
2019
+ msg = (
2020
+ f"{feature_name} is not supported. We recommend to "
2021
+ f"remove {feature_name} from your config."
2022
+ )
2023
+ raise NotImplementedError(msg)
2024
+
2025
+
2026
def human_readable_int(value):
    """Parse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    """
    text = value.strip()
    match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", text)
    if match is None:
        # Regular plain number.
        return int(text)

    number, suffix = match.groups()

    if suffix.islower():
        # Lowercase suffixes are decimal (SI) multipliers; decimals allowed.
        decimal_multiplier = {"k": 10**3, "m": 10**6, "g": 10**9, "t": 10**12}
        return int(float(number) * decimal_multiplier[suffix])

    # Uppercase suffixes are binary multipliers.
    binary_multiplier = {"K": 2**10, "M": 2**20, "G": 2**30, "T": 2**40}
    # Do not allow decimals with binary multipliers
    try:
        return int(number) * binary_multiplier[suffix]
    except ValueError as e:
        raise argparse.ArgumentTypeError(
            "Decimals are not allowed "
            f"with binary suffixes like {suffix}. Did you mean to use "
            f"{number}{suffix.lower()} instead?"
        ) from e