vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1536) hide show
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +983 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +2863 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +18 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +391 -0
  16. vllm/attention/backends/registry.py +195 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +1052 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +121 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/ops/__init__.py +0 -0
  24. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  25. vllm/attention/ops/common.py +414 -0
  26. vllm/attention/ops/flashmla.py +251 -0
  27. vllm/attention/ops/merge_attn_states.py +47 -0
  28. vllm/attention/ops/paged_attn.py +262 -0
  29. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  30. vllm/attention/ops/prefix_prefill.py +814 -0
  31. vllm/attention/ops/rocm_aiter_paged_attn.py +123 -0
  32. vllm/attention/ops/triton_decode_attention.py +712 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +105 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  35. vllm/attention/ops/triton_unified_attention.py +941 -0
  36. vllm/attention/ops/vit_attn_wrappers.py +178 -0
  37. vllm/attention/selector.py +231 -0
  38. vllm/attention/utils/__init__.py +0 -0
  39. vllm/attention/utils/fa_utils.py +109 -0
  40. vllm/attention/utils/kv_sharing_utils.py +33 -0
  41. vllm/attention/utils/kv_transfer_utils.py +60 -0
  42. vllm/beam_search.py +88 -0
  43. vllm/benchmarks/__init__.py +0 -0
  44. vllm/benchmarks/datasets.py +3222 -0
  45. vllm/benchmarks/latency.py +172 -0
  46. vllm/benchmarks/lib/__init__.py +3 -0
  47. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  48. vllm/benchmarks/lib/ready_checker.py +72 -0
  49. vllm/benchmarks/lib/utils.py +79 -0
  50. vllm/benchmarks/serve.py +1531 -0
  51. vllm/benchmarks/sweep/__init__.py +0 -0
  52. vllm/benchmarks/sweep/cli.py +38 -0
  53. vllm/benchmarks/sweep/param_sweep.py +91 -0
  54. vllm/benchmarks/sweep/plot.py +580 -0
  55. vllm/benchmarks/sweep/serve.py +416 -0
  56. vllm/benchmarks/sweep/serve_sla.py +492 -0
  57. vllm/benchmarks/sweep/server.py +114 -0
  58. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  59. vllm/benchmarks/sweep/utils.py +4 -0
  60. vllm/benchmarks/throughput.py +799 -0
  61. vllm/collect_env.py +857 -0
  62. vllm/compilation/__init__.py +0 -0
  63. vllm/compilation/activation_quant_fusion.py +209 -0
  64. vllm/compilation/backends.py +759 -0
  65. vllm/compilation/base_static_graph.py +57 -0
  66. vllm/compilation/caching.py +178 -0
  67. vllm/compilation/collective_fusion.py +1234 -0
  68. vllm/compilation/compiler_interface.py +639 -0
  69. vllm/compilation/counter.py +48 -0
  70. vllm/compilation/cuda_graph.py +208 -0
  71. vllm/compilation/decorators.py +571 -0
  72. vllm/compilation/fix_functionalization.py +253 -0
  73. vllm/compilation/fusion.py +374 -0
  74. vllm/compilation/fusion_attn.py +359 -0
  75. vllm/compilation/fx_utils.py +91 -0
  76. vllm/compilation/inductor_pass.py +133 -0
  77. vllm/compilation/matcher_utils.py +317 -0
  78. vllm/compilation/monitor.py +62 -0
  79. vllm/compilation/noop_elimination.py +134 -0
  80. vllm/compilation/partition_rules.py +72 -0
  81. vllm/compilation/pass_manager.py +135 -0
  82. vllm/compilation/piecewise_backend.py +121 -0
  83. vllm/compilation/post_cleanup.py +21 -0
  84. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  85. vllm/compilation/sequence_parallelism.py +363 -0
  86. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  87. vllm/compilation/vllm_inductor_pass.py +173 -0
  88. vllm/compilation/wrapper.py +238 -0
  89. vllm/config/__init__.py +102 -0
  90. vllm/config/cache.py +207 -0
  91. vllm/config/compilation.py +975 -0
  92. vllm/config/device.py +75 -0
  93. vllm/config/ec_transfer.py +110 -0
  94. vllm/config/kv_events.py +56 -0
  95. vllm/config/kv_transfer.py +114 -0
  96. vllm/config/load.py +124 -0
  97. vllm/config/lora.py +112 -0
  98. vllm/config/model.py +2162 -0
  99. vllm/config/multimodal.py +248 -0
  100. vllm/config/observability.py +123 -0
  101. vllm/config/parallel.py +655 -0
  102. vllm/config/pooler.py +122 -0
  103. vllm/config/scheduler.py +298 -0
  104. vllm/config/speculative.py +654 -0
  105. vllm/config/speech_to_text.py +38 -0
  106. vllm/config/structured_outputs.py +92 -0
  107. vllm/config/utils.py +178 -0
  108. vllm/config/vllm.py +1166 -0
  109. vllm/connections.py +189 -0
  110. vllm/device_allocator/__init__.py +0 -0
  111. vllm/device_allocator/cumem.py +327 -0
  112. vllm/distributed/__init__.py +6 -0
  113. vllm/distributed/communication_op.py +43 -0
  114. vllm/distributed/device_communicators/__init__.py +0 -0
  115. vllm/distributed/device_communicators/all2all.py +490 -0
  116. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  117. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  118. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  119. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  120. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  121. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  122. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  123. vllm/distributed/device_communicators/pynccl.py +386 -0
  124. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  125. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  126. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  127. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  128. vllm/distributed/device_communicators/shm_broadcast.py +733 -0
  129. vllm/distributed/device_communicators/shm_object_storage.py +660 -0
  130. vllm/distributed/device_communicators/symm_mem.py +156 -0
  131. vllm/distributed/device_communicators/tpu_communicator.py +107 -0
  132. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  133. vllm/distributed/ec_transfer/__init__.py +14 -0
  134. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  135. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  136. vllm/distributed/ec_transfer/ec_connector/factory.py +88 -0
  137. vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +201 -0
  138. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  139. vllm/distributed/eplb/__init__.py +8 -0
  140. vllm/distributed/eplb/eplb_state.py +837 -0
  141. vllm/distributed/eplb/rebalance_algo.py +260 -0
  142. vllm/distributed/eplb/rebalance_execute.py +431 -0
  143. vllm/distributed/kv_events.py +371 -0
  144. vllm/distributed/kv_transfer/README.md +29 -0
  145. vllm/distributed/kv_transfer/__init__.py +20 -0
  146. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  147. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  149. vllm/distributed/kv_transfer/kv_connector/factory.py +192 -0
  150. vllm/distributed/kv_transfer/kv_connector/utils.py +268 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/base.py +546 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +216 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +379 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1411 -0
  159. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +867 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +189 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +454 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2440 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +504 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +450 -0
  169. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  170. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +179 -0
  171. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +164 -0
  172. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +242 -0
  173. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  174. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  175. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +295 -0
  176. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +285 -0
  177. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  178. vllm/distributed/parallel_state.py +1759 -0
  179. vllm/distributed/tpu_distributed_utils.py +188 -0
  180. vllm/distributed/utils.py +543 -0
  181. vllm/engine/__init__.py +0 -0
  182. vllm/engine/arg_utils.py +2144 -0
  183. vllm/engine/async_llm_engine.py +6 -0
  184. vllm/engine/llm_engine.py +6 -0
  185. vllm/engine/protocol.py +170 -0
  186. vllm/entrypoints/__init__.py +0 -0
  187. vllm/entrypoints/anthropic/__init__.py +0 -0
  188. vllm/entrypoints/anthropic/protocol.py +162 -0
  189. vllm/entrypoints/anthropic/serving_messages.py +460 -0
  190. vllm/entrypoints/api_server.py +184 -0
  191. vllm/entrypoints/chat_utils.py +1690 -0
  192. vllm/entrypoints/cli/__init__.py +13 -0
  193. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  194. vllm/entrypoints/cli/benchmark/base.py +25 -0
  195. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  196. vllm/entrypoints/cli/benchmark/main.py +56 -0
  197. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  198. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  199. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  200. vllm/entrypoints/cli/collect_env.py +38 -0
  201. vllm/entrypoints/cli/main.py +79 -0
  202. vllm/entrypoints/cli/openai.py +256 -0
  203. vllm/entrypoints/cli/run_batch.py +68 -0
  204. vllm/entrypoints/cli/serve.py +249 -0
  205. vllm/entrypoints/cli/types.py +29 -0
  206. vllm/entrypoints/constants.py +10 -0
  207. vllm/entrypoints/context.py +572 -0
  208. vllm/entrypoints/dynamic_lora.py +57 -0
  209. vllm/entrypoints/harmony_utils.py +535 -0
  210. vllm/entrypoints/launcher.py +175 -0
  211. vllm/entrypoints/llm.py +1768 -0
  212. vllm/entrypoints/logger.py +84 -0
  213. vllm/entrypoints/openai/__init__.py +0 -0
  214. vllm/entrypoints/openai/api_server.py +2096 -0
  215. vllm/entrypoints/openai/cli_args.py +302 -0
  216. vllm/entrypoints/openai/orca_metrics.py +120 -0
  217. vllm/entrypoints/openai/protocol.py +3299 -0
  218. vllm/entrypoints/openai/run_batch.py +547 -0
  219. vllm/entrypoints/openai/serving_chat.py +1772 -0
  220. vllm/entrypoints/openai/serving_classification.py +235 -0
  221. vllm/entrypoints/openai/serving_completion.py +715 -0
  222. vllm/entrypoints/openai/serving_embedding.py +695 -0
  223. vllm/entrypoints/openai/serving_engine.py +1433 -0
  224. vllm/entrypoints/openai/serving_models.py +304 -0
  225. vllm/entrypoints/openai/serving_pooling.py +346 -0
  226. vllm/entrypoints/openai/serving_responses.py +2021 -0
  227. vllm/entrypoints/openai/serving_score.py +503 -0
  228. vllm/entrypoints/openai/serving_tokenization.py +203 -0
  229. vllm/entrypoints/openai/serving_tokens.py +269 -0
  230. vllm/entrypoints/openai/serving_transcription.py +148 -0
  231. vllm/entrypoints/openai/speech_to_text.py +405 -0
  232. vllm/entrypoints/openai/tool_parsers/__init__.py +142 -0
  233. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +273 -0
  234. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +390 -0
  235. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +390 -0
  236. vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +210 -0
  237. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +200 -0
  238. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  239. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +253 -0
  240. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +494 -0
  241. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  242. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +227 -0
  243. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +323 -0
  244. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +590 -0
  245. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  246. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +290 -0
  247. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +37 -0
  248. vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +643 -0
  249. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +849 -0
  250. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +390 -0
  251. vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py +366 -0
  252. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +97 -0
  253. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +120 -0
  254. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +332 -0
  255. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +781 -0
  256. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  257. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +744 -0
  258. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +303 -0
  259. vllm/entrypoints/openai/tool_parsers/utils.py +229 -0
  260. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +556 -0
  261. vllm/entrypoints/renderer.py +409 -0
  262. vllm/entrypoints/responses_utils.py +77 -0
  263. vllm/entrypoints/sagemaker/__init__.py +4 -0
  264. vllm/entrypoints/sagemaker/routes.py +72 -0
  265. vllm/entrypoints/score_utils.py +242 -0
  266. vllm/entrypoints/ssl.py +78 -0
  267. vllm/entrypoints/tool.py +143 -0
  268. vllm/entrypoints/tool_server.py +209 -0
  269. vllm/entrypoints/utils.py +319 -0
  270. vllm/env_override.py +378 -0
  271. vllm/envs.py +1659 -0
  272. vllm/forward_context.py +356 -0
  273. vllm/inputs/__init__.py +44 -0
  274. vllm/inputs/data.py +359 -0
  275. vllm/inputs/parse.py +137 -0
  276. vllm/inputs/preprocess.py +727 -0
  277. vllm/logger.py +267 -0
  278. vllm/logging_utils/__init__.py +10 -0
  279. vllm/logging_utils/dump_input.py +83 -0
  280. vllm/logging_utils/formatter.py +77 -0
  281. vllm/logging_utils/log_time.py +34 -0
  282. vllm/logits_process.py +121 -0
  283. vllm/logprobs.py +208 -0
  284. vllm/lora/__init__.py +0 -0
  285. vllm/lora/layers/__init__.py +41 -0
  286. vllm/lora/layers/base.py +67 -0
  287. vllm/lora/layers/base_linear.py +164 -0
  288. vllm/lora/layers/column_parallel_linear.py +578 -0
  289. vllm/lora/layers/fused_moe.py +472 -0
  290. vllm/lora/layers/logits_processor.py +252 -0
  291. vllm/lora/layers/replicated_linear.py +70 -0
  292. vllm/lora/layers/row_parallel_linear.py +181 -0
  293. vllm/lora/layers/utils.py +65 -0
  294. vllm/lora/layers/vocal_parallel_embedding.py +166 -0
  295. vllm/lora/lora_weights.py +198 -0
  296. vllm/lora/models.py +890 -0
  297. vllm/lora/ops/__init__.py +0 -0
  298. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  299. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  300. vllm/lora/ops/torch_ops/__init__.py +20 -0
  301. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  302. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  303. vllm/lora/ops/triton_ops/__init__.py +21 -0
  304. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +641 -0
  305. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  306. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  307. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  308. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  309. vllm/lora/ops/triton_ops/utils.py +295 -0
  310. vllm/lora/ops/xla_ops/__init__.py +6 -0
  311. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  312. vllm/lora/peft_helper.py +128 -0
  313. vllm/lora/punica_wrapper/__init__.py +10 -0
  314. vllm/lora/punica_wrapper/punica_base.py +492 -0
  315. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  316. vllm/lora/punica_wrapper/punica_gpu.py +411 -0
  317. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  318. vllm/lora/punica_wrapper/punica_tpu.py +359 -0
  319. vllm/lora/punica_wrapper/punica_xpu.py +279 -0
  320. vllm/lora/punica_wrapper/utils.py +150 -0
  321. vllm/lora/request.py +100 -0
  322. vllm/lora/resolver.py +88 -0
  323. vllm/lora/utils.py +293 -0
  324. vllm/lora/worker_manager.py +279 -0
  325. vllm/model_executor/__init__.py +11 -0
  326. vllm/model_executor/custom_op.py +194 -0
  327. vllm/model_executor/layers/__init__.py +0 -0
  328. vllm/model_executor/layers/activation.py +569 -0
  329. vllm/model_executor/layers/attention_layer_base.py +35 -0
  330. vllm/model_executor/layers/batch_invariant.py +854 -0
  331. vllm/model_executor/layers/conv.py +236 -0
  332. vllm/model_executor/layers/fla/__init__.py +8 -0
  333. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  334. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  335. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  336. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  337. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  338. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  339. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  340. vllm/model_executor/layers/fla/ops/index.py +41 -0
  341. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  342. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  343. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  344. vllm/model_executor/layers/fla/ops/op.py +60 -0
  345. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  346. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  347. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  348. vllm/model_executor/layers/fused_moe/__init__.py +106 -0
  349. vllm/model_executor/layers/fused_moe/all2all_utils.py +160 -0
  350. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +406 -0
  351. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +180 -0
  352. vllm/model_executor/layers/fused_moe/config.py +916 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  624. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  625. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +354 -0
  626. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1052 -0
  627. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +387 -0
  628. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +416 -0
  629. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  630. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +367 -0
  631. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  632. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  633. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  634. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  635. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +792 -0
  636. vllm/model_executor/layers/fused_moe/fused_moe.py +2175 -0
  637. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +112 -0
  638. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +164 -0
  639. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +316 -0
  640. vllm/model_executor/layers/fused_moe/layer.py +1944 -0
  641. vllm/model_executor/layers/fused_moe/modular_kernel.py +1222 -0
  642. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +174 -0
  643. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  644. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  645. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  646. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  647. vllm/model_executor/layers/fused_moe/prepare_finalize.py +77 -0
  648. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  649. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  650. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +97 -0
  651. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  652. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  653. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  654. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +578 -0
  655. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  656. vllm/model_executor/layers/kda.py +448 -0
  657. vllm/model_executor/layers/layernorm.py +442 -0
  658. vllm/model_executor/layers/lightning_attn.py +729 -0
  659. vllm/model_executor/layers/linear.py +1424 -0
  660. vllm/model_executor/layers/logits_processor.py +106 -0
  661. vllm/model_executor/layers/mamba/__init__.py +0 -0
  662. vllm/model_executor/layers/mamba/abstract.py +71 -0
  663. vllm/model_executor/layers/mamba/linear_attn.py +402 -0
  664. vllm/model_executor/layers/mamba/mamba_mixer.py +535 -0
  665. vllm/model_executor/layers/mamba/mamba_mixer2.py +928 -0
  666. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  667. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  668. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  669. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  670. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +478 -0
  671. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  672. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  673. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  674. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  675. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  676. vllm/model_executor/layers/mamba/short_conv.py +264 -0
  677. vllm/model_executor/layers/mla.py +168 -0
  678. vllm/model_executor/layers/pooler.py +817 -0
  679. vllm/model_executor/layers/quantization/__init__.py +174 -0
  680. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  681. vllm/model_executor/layers/quantization/awq.py +277 -0
  682. vllm/model_executor/layers/quantization/awq_marlin.py +659 -0
  683. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  684. vllm/model_executor/layers/quantization/base_config.py +170 -0
  685. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  686. vllm/model_executor/layers/quantization/bitsandbytes.py +658 -0
  687. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  688. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +914 -0
  689. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2284 -0
  690. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  691. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  692. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  693. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  694. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  695. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  696. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +183 -0
  697. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  698. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  699. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  700. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  701. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +219 -0
  702. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  703. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  704. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  705. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  706. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  707. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  710. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  711. vllm/model_executor/layers/quantization/experts_int8.py +240 -0
  712. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  713. vllm/model_executor/layers/quantization/fp8.py +1333 -0
  714. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  715. vllm/model_executor/layers/quantization/gguf.py +643 -0
  716. vllm/model_executor/layers/quantization/gptq.py +393 -0
  717. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  718. vllm/model_executor/layers/quantization/gptq_marlin.py +789 -0
  719. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  720. vllm/model_executor/layers/quantization/hqq_marlin.py +371 -0
  721. vllm/model_executor/layers/quantization/inc.py +65 -0
  722. vllm/model_executor/layers/quantization/input_quant_fp8.py +171 -0
  723. vllm/model_executor/layers/quantization/ipex_quant.py +467 -0
  724. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  725. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  726. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +105 -0
  727. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  728. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  729. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  730. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +119 -0
  731. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  732. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  733. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  734. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +166 -0
  735. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +73 -0
  736. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +97 -0
  737. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  738. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +219 -0
  739. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +140 -0
  740. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +42 -0
  741. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  742. vllm/model_executor/layers/quantization/kv_cache.py +146 -0
  743. vllm/model_executor/layers/quantization/modelopt.py +1788 -0
  744. vllm/model_executor/layers/quantization/moe_wna16.py +541 -0
  745. vllm/model_executor/layers/quantization/mxfp4.py +1162 -0
  746. vllm/model_executor/layers/quantization/petit.py +320 -0
  747. vllm/model_executor/layers/quantization/ptpc_fp8.py +137 -0
  748. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  749. vllm/model_executor/layers/quantization/quark/quark.py +528 -0
  750. vllm/model_executor/layers/quantization/quark/quark_moe.py +683 -0
  751. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  752. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +306 -0
  753. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  754. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  755. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  756. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  757. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  758. vllm/model_executor/layers/quantization/rtn.py +652 -0
  759. vllm/model_executor/layers/quantization/schema.py +90 -0
  760. vllm/model_executor/layers/quantization/torchao.py +380 -0
  761. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  762. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  763. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  764. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  976. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +89 -0
  977. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +298 -0
  978. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1203 -0
  979. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  980. vllm/model_executor/layers/quantization/utils/int8_utils.py +489 -0
  981. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  982. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  983. vllm/model_executor/layers/quantization/utils/marlin_utils.py +575 -0
  984. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +397 -0
  985. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +351 -0
  986. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +161 -0
  987. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  988. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +181 -0
  989. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  990. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  991. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  992. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +63 -0
  993. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  994. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  995. vllm/model_executor/layers/quantization/utils/quant_utils.py +687 -0
  996. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +516 -0
  997. vllm/model_executor/layers/resampler.py +283 -0
  998. vllm/model_executor/layers/rotary_embedding/__init__.py +278 -0
  999. vllm/model_executor/layers/rotary_embedding/base.py +235 -0
  1000. vllm/model_executor/layers/rotary_embedding/common.py +188 -0
  1001. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1002. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1003. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1004. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1005. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +75 -0
  1006. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1007. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1008. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1009. vllm/model_executor/layers/rotary_embedding/mrope.py +397 -0
  1010. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1011. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1012. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +81 -0
  1013. vllm/model_executor/layers/utils.py +251 -0
  1014. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1015. vllm/model_executor/model_loader/__init__.py +148 -0
  1016. vllm/model_executor/model_loader/base_loader.py +57 -0
  1017. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1018. vllm/model_executor/model_loader/default_loader.py +327 -0
  1019. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1020. vllm/model_executor/model_loader/gguf_loader.py +176 -0
  1021. vllm/model_executor/model_loader/online_quantization.py +224 -0
  1022. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1023. vllm/model_executor/model_loader/sharded_state_loader.py +206 -0
  1024. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1025. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1026. vllm/model_executor/model_loader/tpu.py +118 -0
  1027. vllm/model_executor/model_loader/utils.py +288 -0
  1028. vllm/model_executor/model_loader/weight_utils.py +1084 -0
  1029. vllm/model_executor/models/__init__.py +44 -0
  1030. vllm/model_executor/models/adapters.py +543 -0
  1031. vllm/model_executor/models/afmoe.py +711 -0
  1032. vllm/model_executor/models/aimv2.py +247 -0
  1033. vllm/model_executor/models/apertus.py +587 -0
  1034. vllm/model_executor/models/arcee.py +439 -0
  1035. vllm/model_executor/models/arctic.py +635 -0
  1036. vllm/model_executor/models/aria.py +655 -0
  1037. vllm/model_executor/models/aya_vision.py +450 -0
  1038. vllm/model_executor/models/baichuan.py +496 -0
  1039. vllm/model_executor/models/bailing_moe.py +646 -0
  1040. vllm/model_executor/models/bamba.py +522 -0
  1041. vllm/model_executor/models/bee.py +157 -0
  1042. vllm/model_executor/models/bert.py +925 -0
  1043. vllm/model_executor/models/bert_with_rope.py +732 -0
  1044. vllm/model_executor/models/blip.py +349 -0
  1045. vllm/model_executor/models/blip2.py +695 -0
  1046. vllm/model_executor/models/bloom.py +390 -0
  1047. vllm/model_executor/models/chameleon.py +1120 -0
  1048. vllm/model_executor/models/chatglm.py +498 -0
  1049. vllm/model_executor/models/clip.py +965 -0
  1050. vllm/model_executor/models/cohere2_vision.py +472 -0
  1051. vllm/model_executor/models/commandr.py +473 -0
  1052. vllm/model_executor/models/config.py +503 -0
  1053. vllm/model_executor/models/dbrx.py +482 -0
  1054. vllm/model_executor/models/deepencoder.py +673 -0
  1055. vllm/model_executor/models/deepseek_eagle.py +260 -0
  1056. vllm/model_executor/models/deepseek_mtp.py +360 -0
  1057. vllm/model_executor/models/deepseek_ocr.py +593 -0
  1058. vllm/model_executor/models/deepseek_v2.py +1649 -0
  1059. vllm/model_executor/models/deepseek_vl2.py +655 -0
  1060. vllm/model_executor/models/dots1.py +574 -0
  1061. vllm/model_executor/models/dots_ocr.py +900 -0
  1062. vllm/model_executor/models/ernie45.py +53 -0
  1063. vllm/model_executor/models/ernie45_moe.py +759 -0
  1064. vllm/model_executor/models/ernie45_vl.py +1742 -0
  1065. vllm/model_executor/models/ernie45_vl_moe.py +803 -0
  1066. vllm/model_executor/models/ernie_mtp.py +279 -0
  1067. vllm/model_executor/models/exaone.py +545 -0
  1068. vllm/model_executor/models/exaone4.py +531 -0
  1069. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1070. vllm/model_executor/models/falcon.py +545 -0
  1071. vllm/model_executor/models/falcon_h1.py +685 -0
  1072. vllm/model_executor/models/flex_olmo.py +155 -0
  1073. vllm/model_executor/models/fuyu.py +373 -0
  1074. vllm/model_executor/models/gemma.py +426 -0
  1075. vllm/model_executor/models/gemma2.py +439 -0
  1076. vllm/model_executor/models/gemma3.py +571 -0
  1077. vllm/model_executor/models/gemma3_mm.py +741 -0
  1078. vllm/model_executor/models/gemma3n.py +1165 -0
  1079. vllm/model_executor/models/gemma3n_mm.py +811 -0
  1080. vllm/model_executor/models/glm.py +23 -0
  1081. vllm/model_executor/models/glm4.py +305 -0
  1082. vllm/model_executor/models/glm4_1v.py +1821 -0
  1083. vllm/model_executor/models/glm4_moe.py +747 -0
  1084. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1085. vllm/model_executor/models/glm4v.py +784 -0
  1086. vllm/model_executor/models/gpt2.py +397 -0
  1087. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1088. vllm/model_executor/models/gpt_j.py +346 -0
  1089. vllm/model_executor/models/gpt_neox.py +344 -0
  1090. vllm/model_executor/models/gpt_oss.py +738 -0
  1091. vllm/model_executor/models/granite.py +516 -0
  1092. vllm/model_executor/models/granite_speech.py +913 -0
  1093. vllm/model_executor/models/granitemoe.py +569 -0
  1094. vllm/model_executor/models/granitemoehybrid.py +709 -0
  1095. vllm/model_executor/models/granitemoeshared.py +333 -0
  1096. vllm/model_executor/models/gritlm.py +245 -0
  1097. vllm/model_executor/models/grok1.py +558 -0
  1098. vllm/model_executor/models/h2ovl.py +554 -0
  1099. vllm/model_executor/models/hunyuan_v1.py +1053 -0
  1100. vllm/model_executor/models/hyperclovax_vision.py +1166 -0
  1101. vllm/model_executor/models/idefics2_vision_model.py +426 -0
  1102. vllm/model_executor/models/idefics3.py +717 -0
  1103. vllm/model_executor/models/interfaces.py +1092 -0
  1104. vllm/model_executor/models/interfaces_base.py +214 -0
  1105. vllm/model_executor/models/intern_vit.py +453 -0
  1106. vllm/model_executor/models/internlm2.py +460 -0
  1107. vllm/model_executor/models/internlm2_ve.py +142 -0
  1108. vllm/model_executor/models/interns1.py +830 -0
  1109. vllm/model_executor/models/interns1_vit.py +432 -0
  1110. vllm/model_executor/models/internvl.py +1452 -0
  1111. vllm/model_executor/models/jais.py +397 -0
  1112. vllm/model_executor/models/jamba.py +610 -0
  1113. vllm/model_executor/models/jina_vl.py +147 -0
  1114. vllm/model_executor/models/keye.py +1761 -0
  1115. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1116. vllm/model_executor/models/kimi_linear.py +663 -0
  1117. vllm/model_executor/models/kimi_vl.py +578 -0
  1118. vllm/model_executor/models/lfm2.py +532 -0
  1119. vllm/model_executor/models/lfm2_moe.py +762 -0
  1120. vllm/model_executor/models/lightonocr.py +195 -0
  1121. vllm/model_executor/models/llama.py +732 -0
  1122. vllm/model_executor/models/llama4.py +859 -0
  1123. vllm/model_executor/models/llama4_eagle.py +223 -0
  1124. vllm/model_executor/models/llama_eagle.py +218 -0
  1125. vllm/model_executor/models/llama_eagle3.py +367 -0
  1126. vllm/model_executor/models/llava.py +842 -0
  1127. vllm/model_executor/models/llava_next.py +583 -0
  1128. vllm/model_executor/models/llava_next_video.py +467 -0
  1129. vllm/model_executor/models/llava_onevision.py +923 -0
  1130. vllm/model_executor/models/longcat_flash.py +749 -0
  1131. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1132. vllm/model_executor/models/mamba.py +276 -0
  1133. vllm/model_executor/models/mamba2.py +289 -0
  1134. vllm/model_executor/models/medusa.py +179 -0
  1135. vllm/model_executor/models/midashenglm.py +827 -0
  1136. vllm/model_executor/models/mimo.py +188 -0
  1137. vllm/model_executor/models/mimo_mtp.py +294 -0
  1138. vllm/model_executor/models/minicpm.py +664 -0
  1139. vllm/model_executor/models/minicpm3.py +242 -0
  1140. vllm/model_executor/models/minicpm_eagle.py +389 -0
  1141. vllm/model_executor/models/minicpmo.py +768 -0
  1142. vllm/model_executor/models/minicpmv.py +1745 -0
  1143. vllm/model_executor/models/minimax_m2.py +552 -0
  1144. vllm/model_executor/models/minimax_text_01.py +1012 -0
  1145. vllm/model_executor/models/minimax_vl_01.py +396 -0
  1146. vllm/model_executor/models/mistral3.py +637 -0
  1147. vllm/model_executor/models/mixtral.py +621 -0
  1148. vllm/model_executor/models/mllama4.py +1147 -0
  1149. vllm/model_executor/models/mlp_speculator.py +235 -0
  1150. vllm/model_executor/models/modernbert.py +450 -0
  1151. vllm/model_executor/models/module_mapping.py +74 -0
  1152. vllm/model_executor/models/molmo.py +1555 -0
  1153. vllm/model_executor/models/moonvit.py +677 -0
  1154. vllm/model_executor/models/mpt.py +335 -0
  1155. vllm/model_executor/models/nano_nemotron_vl.py +1740 -0
  1156. vllm/model_executor/models/nemotron.py +518 -0
  1157. vllm/model_executor/models/nemotron_h.py +852 -0
  1158. vllm/model_executor/models/nemotron_nas.py +491 -0
  1159. vllm/model_executor/models/nemotron_vl.py +653 -0
  1160. vllm/model_executor/models/nvlm_d.py +216 -0
  1161. vllm/model_executor/models/olmo.py +414 -0
  1162. vllm/model_executor/models/olmo2.py +454 -0
  1163. vllm/model_executor/models/olmoe.py +498 -0
  1164. vllm/model_executor/models/openpangu.py +1062 -0
  1165. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1166. vllm/model_executor/models/opt.py +426 -0
  1167. vllm/model_executor/models/orion.py +372 -0
  1168. vllm/model_executor/models/ouro.py +516 -0
  1169. vllm/model_executor/models/ovis.py +559 -0
  1170. vllm/model_executor/models/ovis2_5.py +673 -0
  1171. vllm/model_executor/models/paddleocr_vl.py +1407 -0
  1172. vllm/model_executor/models/paligemma.py +412 -0
  1173. vllm/model_executor/models/persimmon.py +377 -0
  1174. vllm/model_executor/models/phi.py +374 -0
  1175. vllm/model_executor/models/phi3.py +18 -0
  1176. vllm/model_executor/models/phi3v.py +737 -0
  1177. vllm/model_executor/models/phi4_multimodal.py +1447 -0
  1178. vllm/model_executor/models/phi4mm.py +1253 -0
  1179. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1180. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1181. vllm/model_executor/models/phimoe.py +675 -0
  1182. vllm/model_executor/models/pixtral.py +1352 -0
  1183. vllm/model_executor/models/plamo2.py +981 -0
  1184. vllm/model_executor/models/qwen.py +368 -0
  1185. vllm/model_executor/models/qwen2.py +541 -0
  1186. vllm/model_executor/models/qwen2_5_omni_thinker.py +1246 -0
  1187. vllm/model_executor/models/qwen2_5_vl.py +1613 -0
  1188. vllm/model_executor/models/qwen2_audio.py +473 -0
  1189. vllm/model_executor/models/qwen2_moe.py +596 -0
  1190. vllm/model_executor/models/qwen2_rm.py +123 -0
  1191. vllm/model_executor/models/qwen2_vl.py +1670 -0
  1192. vllm/model_executor/models/qwen3.py +336 -0
  1193. vllm/model_executor/models/qwen3_moe.py +744 -0
  1194. vllm/model_executor/models/qwen3_next.py +1395 -0
  1195. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1196. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1721 -0
  1197. vllm/model_executor/models/qwen3_vl.py +1673 -0
  1198. vllm/model_executor/models/qwen3_vl_moe.py +415 -0
  1199. vllm/model_executor/models/qwen_vl.py +802 -0
  1200. vllm/model_executor/models/radio.py +555 -0
  1201. vllm/model_executor/models/registry.py +1155 -0
  1202. vllm/model_executor/models/roberta.py +259 -0
  1203. vllm/model_executor/models/rvl.py +107 -0
  1204. vllm/model_executor/models/seed_oss.py +497 -0
  1205. vllm/model_executor/models/siglip.py +1174 -0
  1206. vllm/model_executor/models/siglip2navit.py +724 -0
  1207. vllm/model_executor/models/skyworkr1v.py +953 -0
  1208. vllm/model_executor/models/smolvlm.py +38 -0
  1209. vllm/model_executor/models/solar.py +502 -0
  1210. vllm/model_executor/models/stablelm.py +359 -0
  1211. vllm/model_executor/models/starcoder2.py +367 -0
  1212. vllm/model_executor/models/step3_text.py +559 -0
  1213. vllm/model_executor/models/step3_vl.py +1148 -0
  1214. vllm/model_executor/models/swin.py +514 -0
  1215. vllm/model_executor/models/tarsier.py +619 -0
  1216. vllm/model_executor/models/telechat2.py +153 -0
  1217. vllm/model_executor/models/teleflm.py +78 -0
  1218. vllm/model_executor/models/terratorch.py +319 -0
  1219. vllm/model_executor/models/transformers/__init__.py +127 -0
  1220. vllm/model_executor/models/transformers/base.py +464 -0
  1221. vllm/model_executor/models/transformers/causal.py +65 -0
  1222. vllm/model_executor/models/transformers/legacy.py +90 -0
  1223. vllm/model_executor/models/transformers/moe.py +318 -0
  1224. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1225. vllm/model_executor/models/transformers/pooling.py +119 -0
  1226. vllm/model_executor/models/transformers/utils.py +207 -0
  1227. vllm/model_executor/models/ultravox.py +681 -0
  1228. vllm/model_executor/models/utils.py +877 -0
  1229. vllm/model_executor/models/vision.py +552 -0
  1230. vllm/model_executor/models/voxtral.py +845 -0
  1231. vllm/model_executor/models/whisper.py +959 -0
  1232. vllm/model_executor/models/zamba2.py +986 -0
  1233. vllm/model_executor/parameter.py +642 -0
  1234. vllm/model_executor/utils.py +94 -0
  1235. vllm/model_executor/warmup/__init__.py +0 -0
  1236. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1237. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1238. vllm/multimodal/__init__.py +40 -0
  1239. vllm/multimodal/audio.py +118 -0
  1240. vllm/multimodal/base.py +26 -0
  1241. vllm/multimodal/cache.py +755 -0
  1242. vllm/multimodal/evs.py +294 -0
  1243. vllm/multimodal/hasher.py +106 -0
  1244. vllm/multimodal/image.py +130 -0
  1245. vllm/multimodal/inputs.py +1036 -0
  1246. vllm/multimodal/parse.py +544 -0
  1247. vllm/multimodal/processing.py +2186 -0
  1248. vllm/multimodal/profiling.py +369 -0
  1249. vllm/multimodal/registry.py +360 -0
  1250. vllm/multimodal/utils.py +512 -0
  1251. vllm/multimodal/video.py +306 -0
  1252. vllm/outputs.py +345 -0
  1253. vllm/platforms/__init__.py +277 -0
  1254. vllm/platforms/cpu.py +414 -0
  1255. vllm/platforms/cuda.py +657 -0
  1256. vllm/platforms/interface.py +639 -0
  1257. vllm/platforms/rocm.py +466 -0
  1258. vllm/platforms/tpu.py +276 -0
  1259. vllm/platforms/xpu.py +274 -0
  1260. vllm/plugins/__init__.py +78 -0
  1261. vllm/plugins/io_processors/__init__.py +68 -0
  1262. vllm/plugins/io_processors/interface.py +77 -0
  1263. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1264. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1265. vllm/pooling_params.py +228 -0
  1266. vllm/profiler/__init__.py +0 -0
  1267. vllm/profiler/gpu_profiler.py +37 -0
  1268. vllm/profiler/layerwise_profile.py +392 -0
  1269. vllm/profiler/utils.py +151 -0
  1270. vllm/py.typed +2 -0
  1271. vllm/ray/__init__.py +0 -0
  1272. vllm/ray/lazy_utils.py +26 -0
  1273. vllm/ray/ray_env.py +79 -0
  1274. vllm/reasoning/__init__.py +92 -0
  1275. vllm/reasoning/abs_reasoning_parsers.py +290 -0
  1276. vllm/reasoning/basic_parsers.py +162 -0
  1277. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1278. vllm/reasoning/deepseek_v3_reasoning_parser.py +62 -0
  1279. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1280. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1281. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1282. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1283. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1284. vllm/reasoning/identity_reasoning_parser.py +58 -0
  1285. vllm/reasoning/minimax_m2_reasoning_parser.py +67 -0
  1286. vllm/reasoning/mistral_reasoning_parser.py +55 -0
  1287. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1288. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1289. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1290. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1291. vllm/sampling_params.py +669 -0
  1292. vllm/scalar_type.py +355 -0
  1293. vllm/scripts.py +17 -0
  1294. vllm/sequence.py +98 -0
  1295. vllm/tasks.py +13 -0
  1296. vllm/third_party/__init__.py +0 -0
  1297. vllm/third_party/pynvml.py +6140 -0
  1298. vllm/tracing.py +135 -0
  1299. vllm/transformers_utils/__init__.py +26 -0
  1300. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1301. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1302. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1303. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1304. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1305. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1306. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1307. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1308. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1309. vllm/transformers_utils/config.py +1203 -0
  1310. vllm/transformers_utils/config_parser_base.py +20 -0
  1311. vllm/transformers_utils/configs/__init__.py +70 -0
  1312. vllm/transformers_utils/configs/afmoe.py +84 -0
  1313. vllm/transformers_utils/configs/arctic.py +206 -0
  1314. vllm/transformers_utils/configs/chatglm.py +75 -0
  1315. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1316. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1317. vllm/transformers_utils/configs/eagle.py +84 -0
  1318. vllm/transformers_utils/configs/falcon.py +89 -0
  1319. vllm/transformers_utils/configs/flex_olmo.py +77 -0
  1320. vllm/transformers_utils/configs/jais.py +243 -0
  1321. vllm/transformers_utils/configs/kimi_linear.py +144 -0
  1322. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1323. vllm/transformers_utils/configs/lfm2_moe.py +159 -0
  1324. vllm/transformers_utils/configs/medusa.py +65 -0
  1325. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1326. vllm/transformers_utils/configs/mistral.py +174 -0
  1327. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1328. vllm/transformers_utils/configs/moonvit.py +33 -0
  1329. vllm/transformers_utils/configs/nemotron.py +212 -0
  1330. vllm/transformers_utils/configs/nemotron_h.py +282 -0
  1331. vllm/transformers_utils/configs/olmo3.py +79 -0
  1332. vllm/transformers_utils/configs/ovis.py +182 -0
  1333. vllm/transformers_utils/configs/qwen3_next.py +274 -0
  1334. vllm/transformers_utils/configs/radio.py +89 -0
  1335. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1336. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1337. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1338. vllm/transformers_utils/configs/step3_vl.py +174 -0
  1339. vllm/transformers_utils/configs/ultravox.py +118 -0
  1340. vllm/transformers_utils/detokenizer_utils.py +198 -0
  1341. vllm/transformers_utils/dynamic_module.py +59 -0
  1342. vllm/transformers_utils/processor.py +402 -0
  1343. vllm/transformers_utils/processors/__init__.py +15 -0
  1344. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1345. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1346. vllm/transformers_utils/processors/ovis.py +453 -0
  1347. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1348. vllm/transformers_utils/runai_utils.py +104 -0
  1349. vllm/transformers_utils/s3_utils.py +95 -0
  1350. vllm/transformers_utils/tokenizer.py +293 -0
  1351. vllm/transformers_utils/tokenizer_base.py +155 -0
  1352. vllm/transformers_utils/tokenizers/__init__.py +16 -0
  1353. vllm/transformers_utils/tokenizers/mistral.py +502 -0
  1354. vllm/transformers_utils/utils.py +130 -0
  1355. vllm/triton_utils/__init__.py +19 -0
  1356. vllm/triton_utils/importing.py +103 -0
  1357. vllm/usage/__init__.py +0 -0
  1358. vllm/usage/usage_lib.py +294 -0
  1359. vllm/utils/__init__.py +82 -0
  1360. vllm/utils/argparse_utils.py +487 -0
  1361. vllm/utils/async_utils.py +303 -0
  1362. vllm/utils/cache.py +214 -0
  1363. vllm/utils/collection_utils.py +139 -0
  1364. vllm/utils/counter.py +45 -0
  1365. vllm/utils/deep_gemm.py +391 -0
  1366. vllm/utils/flashinfer.py +490 -0
  1367. vllm/utils/func_utils.py +236 -0
  1368. vllm/utils/gc_utils.py +147 -0
  1369. vllm/utils/hashing.py +63 -0
  1370. vllm/utils/import_utils.py +411 -0
  1371. vllm/utils/jsontree.py +165 -0
  1372. vllm/utils/math_utils.py +32 -0
  1373. vllm/utils/mem_constants.py +13 -0
  1374. vllm/utils/mem_utils.py +232 -0
  1375. vllm/utils/nccl.py +64 -0
  1376. vllm/utils/network_utils.py +331 -0
  1377. vllm/utils/platform_utils.py +59 -0
  1378. vllm/utils/profiling.py +56 -0
  1379. vllm/utils/registry.py +49 -0
  1380. vllm/utils/serial_utils.py +169 -0
  1381. vllm/utils/system_utils.py +229 -0
  1382. vllm/utils/tensor_schema.py +255 -0
  1383. vllm/utils/torch_utils.py +657 -0
  1384. vllm/v1/__init__.py +0 -0
  1385. vllm/v1/attention/__init__.py +0 -0
  1386. vllm/v1/attention/backends/__init__.py +0 -0
  1387. vllm/v1/attention/backends/cpu_attn.py +496 -0
  1388. vllm/v1/attention/backends/flash_attn.py +1028 -0
  1389. vllm/v1/attention/backends/flashinfer.py +1572 -0
  1390. vllm/v1/attention/backends/flex_attention.py +926 -0
  1391. vllm/v1/attention/backends/gdn_attn.py +387 -0
  1392. vllm/v1/attention/backends/linear_attn.py +74 -0
  1393. vllm/v1/attention/backends/mamba1_attn.py +165 -0
  1394. vllm/v1/attention/backends/mamba2_attn.py +354 -0
  1395. vllm/v1/attention/backends/mamba_attn.py +115 -0
  1396. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1397. vllm/v1/attention/backends/mla/common.py +2031 -0
  1398. vllm/v1/attention/backends/mla/cutlass_mla.py +275 -0
  1399. vllm/v1/attention/backends/mla/flashattn_mla.py +337 -0
  1400. vllm/v1/attention/backends/mla/flashinfer_mla.py +171 -0
  1401. vllm/v1/attention/backends/mla/flashmla.py +314 -0
  1402. vllm/v1/attention/backends/mla/flashmla_sparse.py +548 -0
  1403. vllm/v1/attention/backends/mla/indexer.py +362 -0
  1404. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +294 -0
  1405. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1406. vllm/v1/attention/backends/pallas.py +436 -0
  1407. vllm/v1/attention/backends/rocm_aiter_fa.py +816 -0
  1408. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +196 -0
  1409. vllm/v1/attention/backends/rocm_attn.py +362 -0
  1410. vllm/v1/attention/backends/short_conv_attn.py +105 -0
  1411. vllm/v1/attention/backends/tree_attn.py +425 -0
  1412. vllm/v1/attention/backends/triton_attn.py +373 -0
  1413. vllm/v1/attention/backends/utils.py +1116 -0
  1414. vllm/v1/attention/backends/xformers.py +417 -0
  1415. vllm/v1/core/__init__.py +0 -0
  1416. vllm/v1/core/block_pool.py +428 -0
  1417. vllm/v1/core/encoder_cache_manager.py +343 -0
  1418. vllm/v1/core/kv_cache_coordinator.py +480 -0
  1419. vllm/v1/core/kv_cache_manager.py +420 -0
  1420. vllm/v1/core/kv_cache_utils.py +1340 -0
  1421. vllm/v1/core/sched/__init__.py +0 -0
  1422. vllm/v1/core/sched/async_scheduler.py +62 -0
  1423. vllm/v1/core/sched/interface.py +181 -0
  1424. vllm/v1/core/sched/output.py +202 -0
  1425. vllm/v1/core/sched/request_queue.py +221 -0
  1426. vllm/v1/core/sched/scheduler.py +1617 -0
  1427. vllm/v1/core/sched/utils.py +72 -0
  1428. vllm/v1/core/single_type_kv_cache_manager.py +736 -0
  1429. vllm/v1/cudagraph_dispatcher.py +148 -0
  1430. vllm/v1/engine/__init__.py +206 -0
  1431. vllm/v1/engine/async_llm.py +797 -0
  1432. vllm/v1/engine/coordinator.py +377 -0
  1433. vllm/v1/engine/core.py +1420 -0
  1434. vllm/v1/engine/core_client.py +1400 -0
  1435. vllm/v1/engine/detokenizer.py +351 -0
  1436. vllm/v1/engine/exceptions.py +18 -0
  1437. vllm/v1/engine/llm_engine.py +408 -0
  1438. vllm/v1/engine/logprobs.py +182 -0
  1439. vllm/v1/engine/output_processor.py +642 -0
  1440. vllm/v1/engine/parallel_sampling.py +145 -0
  1441. vllm/v1/engine/processor.py +621 -0
  1442. vllm/v1/engine/utils.py +1072 -0
  1443. vllm/v1/executor/__init__.py +6 -0
  1444. vllm/v1/executor/abstract.py +352 -0
  1445. vllm/v1/executor/multiproc_executor.py +877 -0
  1446. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1447. vllm/v1/executor/ray_executor.py +626 -0
  1448. vllm/v1/executor/ray_utils.py +465 -0
  1449. vllm/v1/executor/uniproc_executor.py +183 -0
  1450. vllm/v1/kv_cache_interface.py +403 -0
  1451. vllm/v1/kv_offload/__init__.py +0 -0
  1452. vllm/v1/kv_offload/abstract.py +161 -0
  1453. vllm/v1/kv_offload/arc_manager.py +237 -0
  1454. vllm/v1/kv_offload/backend.py +97 -0
  1455. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1456. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1457. vllm/v1/kv_offload/cpu.py +93 -0
  1458. vllm/v1/kv_offload/factory.py +56 -0
  1459. vllm/v1/kv_offload/lru_manager.py +139 -0
  1460. vllm/v1/kv_offload/mediums.py +39 -0
  1461. vllm/v1/kv_offload/spec.py +62 -0
  1462. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1463. vllm/v1/kv_offload/worker/cpu_gpu.py +185 -0
  1464. vllm/v1/kv_offload/worker/worker.py +144 -0
  1465. vllm/v1/metrics/__init__.py +0 -0
  1466. vllm/v1/metrics/loggers.py +1238 -0
  1467. vllm/v1/metrics/prometheus.py +82 -0
  1468. vllm/v1/metrics/ray_wrappers.py +169 -0
  1469. vllm/v1/metrics/reader.py +257 -0
  1470. vllm/v1/metrics/stats.py +420 -0
  1471. vllm/v1/outputs.py +249 -0
  1472. vllm/v1/pool/__init__.py +0 -0
  1473. vllm/v1/pool/metadata.py +82 -0
  1474. vllm/v1/request.py +259 -0
  1475. vllm/v1/sample/__init__.py +0 -0
  1476. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1477. vllm/v1/sample/logits_processor/builtin.py +274 -0
  1478. vllm/v1/sample/logits_processor/interface.py +106 -0
  1479. vllm/v1/sample/logits_processor/state.py +165 -0
  1480. vllm/v1/sample/metadata.py +44 -0
  1481. vllm/v1/sample/ops/__init__.py +0 -0
  1482. vllm/v1/sample/ops/bad_words.py +52 -0
  1483. vllm/v1/sample/ops/logprobs.py +25 -0
  1484. vllm/v1/sample/ops/penalties.py +57 -0
  1485. vllm/v1/sample/ops/topk_topp_sampler.py +290 -0
  1486. vllm/v1/sample/rejection_sampler.py +793 -0
  1487. vllm/v1/sample/sampler.py +316 -0
  1488. vllm/v1/sample/tpu/__init__.py +0 -0
  1489. vllm/v1/sample/tpu/metadata.py +120 -0
  1490. vllm/v1/sample/tpu/sampler.py +215 -0
  1491. vllm/v1/serial_utils.py +532 -0
  1492. vllm/v1/spec_decode/__init__.py +0 -0
  1493. vllm/v1/spec_decode/eagle.py +1225 -0
  1494. vllm/v1/spec_decode/medusa.py +73 -0
  1495. vllm/v1/spec_decode/metadata.py +66 -0
  1496. vllm/v1/spec_decode/metrics.py +224 -0
  1497. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1498. vllm/v1/spec_decode/suffix_decoding.py +103 -0
  1499. vllm/v1/spec_decode/utils.py +16 -0
  1500. vllm/v1/structured_output/__init__.py +338 -0
  1501. vllm/v1/structured_output/backend_guidance.py +265 -0
  1502. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1503. vllm/v1/structured_output/backend_outlines.py +324 -0
  1504. vllm/v1/structured_output/backend_types.py +136 -0
  1505. vllm/v1/structured_output/backend_xgrammar.py +362 -0
  1506. vllm/v1/structured_output/request.py +94 -0
  1507. vllm/v1/structured_output/utils.py +469 -0
  1508. vllm/v1/utils.py +414 -0
  1509. vllm/v1/worker/__init__.py +0 -0
  1510. vllm/v1/worker/block_table.py +327 -0
  1511. vllm/v1/worker/cpu_model_runner.py +122 -0
  1512. vllm/v1/worker/cpu_worker.py +206 -0
  1513. vllm/v1/worker/dp_utils.py +230 -0
  1514. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1515. vllm/v1/worker/gpu_input_batch.py +975 -0
  1516. vllm/v1/worker/gpu_model_runner.py +5102 -0
  1517. vllm/v1/worker/gpu_ubatch_wrapper.py +466 -0
  1518. vllm/v1/worker/gpu_worker.py +894 -0
  1519. vllm/v1/worker/kv_connector_model_runner_mixin.py +144 -0
  1520. vllm/v1/worker/lora_model_runner_mixin.py +213 -0
  1521. vllm/v1/worker/tpu_input_batch.py +593 -0
  1522. vllm/v1/worker/tpu_model_runner.py +2173 -0
  1523. vllm/v1/worker/tpu_worker.py +355 -0
  1524. vllm/v1/worker/ubatch_utils.py +73 -0
  1525. vllm/v1/worker/ubatching.py +231 -0
  1526. vllm/v1/worker/utils.py +366 -0
  1527. vllm/v1/worker/worker_base.py +375 -0
  1528. vllm/v1/worker/xpu_model_runner.py +55 -0
  1529. vllm/v1/worker/xpu_worker.py +189 -0
  1530. vllm/version.py +39 -0
  1531. vllm/vllm_flash_attn/.gitkeep +0 -0
  1532. vllm_cpu_amxbf16-0.11.2.post2.dist-info/METADATA +345 -0
  1533. vllm_cpu_amxbf16-0.11.2.post2.dist-info/RECORD +1536 -0
  1534. vllm_cpu_amxbf16-0.11.2.post2.dist-info/WHEEL +5 -0
  1535. vllm_cpu_amxbf16-0.11.2.post2.dist-info/entry_points.txt +5 -0
  1536. vllm_cpu_amxbf16-0.11.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1400 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ import asyncio
4
+ import contextlib
5
+ import multiprocessing
6
+ import queue
7
+ import sys
8
+ import uuid
9
+ import weakref
10
+ from abc import ABC, abstractmethod
11
+ from collections import defaultdict, deque
12
+ from collections.abc import Awaitable, Callable, Sequence
13
+ from concurrent.futures import Future
14
+ from dataclasses import dataclass
15
+ from threading import Thread
16
+ from typing import Any, TypeAlias, TypeVar
17
+
18
+ import msgspec.msgpack
19
+ import zmq
20
+ import zmq.asyncio
21
+
22
+ from vllm.config import VllmConfig
23
+ from vllm.logger import init_logger
24
+ from vllm.lora.request import LoRARequest
25
+ from vllm.tasks import SupportedTask
26
+ from vllm.utils.async_utils import in_loop
27
+ from vllm.utils.network_utils import (
28
+ close_sockets,
29
+ get_open_port,
30
+ get_open_zmq_inproc_path,
31
+ make_zmq_socket,
32
+ )
33
+ from vllm.v1.engine import (
34
+ EngineCoreOutputs,
35
+ EngineCoreRequest,
36
+ EngineCoreRequestType,
37
+ ReconfigureDistributedRequest,
38
+ ReconfigureRankType,
39
+ UtilityOutput,
40
+ )
41
+ from vllm.v1.engine.coordinator import DPCoordinator
42
+ from vllm.v1.engine.core import EngineCore, EngineCoreProc
43
+ from vllm.v1.engine.exceptions import EngineDeadError
44
+ from vllm.v1.engine.utils import (
45
+ CoreEngineActorManager,
46
+ CoreEngineProcManager,
47
+ launch_core_engines,
48
+ )
49
+ from vllm.v1.executor import Executor
50
+ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
51
+
52
+ logger = init_logger(__name__)
53
+
54
+ AnyFuture: TypeAlias = asyncio.Future[Any] | Future[Any]
55
+
56
+ _R = TypeVar("_R") # Return type for collective_rpc
57
+
58
+ EngineIdentity = bytes
59
+
60
+
61
+ class EngineCoreClient(ABC):
62
+ """
63
+ EngineCoreClient: subclasses handle different methods for pushing
64
+ and pulling from the EngineCore for asyncio / multiprocessing.
65
+
66
+ Subclasses:
67
+ * InprocClient: In process EngineCore (for V0-style LLMEngine use)
68
+ * SyncMPClient: ZMQ + background proc EngineCore (for LLM)
69
+ * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM)
70
+ """
71
+
72
+ @staticmethod
73
+ def make_client(
74
+ multiprocess_mode: bool,
75
+ asyncio_mode: bool,
76
+ vllm_config: VllmConfig,
77
+ executor_class: type[Executor],
78
+ log_stats: bool,
79
+ ) -> "EngineCoreClient":
80
+ # TODO: support this for debugging purposes.
81
+ if asyncio_mode and not multiprocess_mode:
82
+ raise NotImplementedError(
83
+ "Running EngineCore in asyncio without multiprocessing "
84
+ "is not currently supported."
85
+ )
86
+
87
+ if multiprocess_mode and asyncio_mode:
88
+ return EngineCoreClient.make_async_mp_client(
89
+ vllm_config, executor_class, log_stats
90
+ )
91
+
92
+ if multiprocess_mode and not asyncio_mode:
93
+ return SyncMPClient(vllm_config, executor_class, log_stats)
94
+
95
+ return InprocClient(vllm_config, executor_class, log_stats)
96
+
97
+ @staticmethod
98
+ def make_async_mp_client(
99
+ vllm_config: VllmConfig,
100
+ executor_class: type[Executor],
101
+ log_stats: bool,
102
+ client_addresses: dict[str, str] | None = None,
103
+ client_count: int = 1,
104
+ client_index: int = 0,
105
+ ) -> "MPClient":
106
+ parallel_config = vllm_config.parallel_config
107
+ client_args = (
108
+ vllm_config,
109
+ executor_class,
110
+ log_stats,
111
+ client_addresses,
112
+ client_count,
113
+ client_index,
114
+ )
115
+ if parallel_config.data_parallel_size > 1:
116
+ if parallel_config.data_parallel_external_lb:
117
+ # External load balancer - client per DP rank.
118
+ return DPAsyncMPClient(*client_args)
119
+ # Internal load balancer - client balances to all DP ranks.
120
+ return DPLBAsyncMPClient(*client_args)
121
+ return AsyncMPClient(*client_args)
122
+
123
+ @abstractmethod
124
+ def shutdown(self): ...
125
+
126
+ def get_output(self) -> EngineCoreOutputs:
127
+ raise NotImplementedError
128
+
129
+ def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
130
+ raise NotImplementedError
131
+
132
+ def add_request(self, request: EngineCoreRequest) -> None:
133
+ raise NotImplementedError
134
+
135
+ def profile(self, is_start: bool = True) -> None:
136
+ raise NotImplementedError
137
+
138
+ def reset_mm_cache(self) -> None:
139
+ raise NotImplementedError
140
+
141
+ def reset_prefix_cache(self) -> None:
142
+ raise NotImplementedError
143
+
144
+ def sleep(self, level: int = 1) -> None:
145
+ raise NotImplementedError
146
+
147
+ def wake_up(self, tags: list[str] | None = None) -> None:
148
+ raise NotImplementedError
149
+
150
+ def is_sleeping(self) -> bool:
151
+ raise NotImplementedError
152
+
153
+ def execute_dummy_batch(self) -> None:
154
+ raise NotImplementedError
155
+
156
+ async def execute_dummy_batch_async(self) -> None:
157
+ raise NotImplementedError
158
+
159
+ def abort_requests(self, request_ids: list[str]) -> None:
160
+ raise NotImplementedError
161
+
162
+ def add_lora(self, lora_request: LoRARequest) -> bool:
163
+ raise NotImplementedError
164
+
165
+ def remove_lora(self, lora_id: int) -> bool:
166
+ raise NotImplementedError
167
+
168
+ def list_loras(self) -> set[int]:
169
+ raise NotImplementedError
170
+
171
+ def pin_lora(self, lora_id: int) -> bool:
172
+ raise NotImplementedError
173
+
174
+ def save_sharded_state(
175
+ self, path: str, pattern: str | None = None, max_size: int | None = None
176
+ ) -> None:
177
+ raise NotImplementedError
178
+
179
+ def collective_rpc(
180
+ self,
181
+ method: str | Callable[..., _R],
182
+ timeout: float | None = None,
183
+ args: tuple = (),
184
+ kwargs: dict[str, Any] | None = None,
185
+ ) -> list[_R]:
186
+ raise NotImplementedError
187
+
188
+ def dp_engines_running(self) -> bool:
189
+ """Returns True id data parallel engines are collectively in a
190
+ running state."""
191
+ raise NotImplementedError
192
+
193
+ async def scale_elastic_ep(self, new_data_parallel_size: int) -> None:
194
+ raise NotImplementedError
195
+
196
+ async def get_output_async(self) -> EngineCoreOutputs:
197
+ raise NotImplementedError
198
+
199
+ async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
200
+ raise NotImplementedError
201
+
202
+ async def add_request_async(self, request: EngineCoreRequest) -> None:
203
+ raise NotImplementedError
204
+
205
+ async def profile_async(self, is_start: bool = True) -> None:
206
+ raise NotImplementedError
207
+
208
+ async def reset_mm_cache_async(self) -> None:
209
+ raise NotImplementedError
210
+
211
+ async def reset_prefix_cache_async(self) -> None:
212
+ raise NotImplementedError
213
+
214
+ async def sleep_async(self, level: int = 1) -> None:
215
+ raise NotImplementedError
216
+
217
+ async def wake_up_async(self, tags: list[str] | None = None) -> None:
218
+ raise NotImplementedError
219
+
220
+ async def is_sleeping_async(self) -> bool:
221
+ raise NotImplementedError
222
+
223
+ async def abort_requests_async(self, request_ids: list[str]) -> None:
224
+ raise NotImplementedError
225
+
226
+ async def add_lora_async(self, lora_request: LoRARequest) -> bool:
227
+ raise NotImplementedError
228
+
229
+ async def remove_lora_async(self, lora_id: int) -> bool:
230
+ raise NotImplementedError
231
+
232
+ async def list_loras_async(self) -> set[int]:
233
+ raise NotImplementedError
234
+
235
+ async def pin_lora_async(self, lora_id: int) -> bool:
236
+ raise NotImplementedError
237
+
238
+ async def save_sharded_state_async(
239
+ self, path: str, pattern: str | None = None, max_size: int | None = None
240
+ ) -> None:
241
+ raise NotImplementedError
242
+
243
+ async def collective_rpc_async(
244
+ self,
245
+ method: str | Callable[..., _R],
246
+ timeout: float | None = None,
247
+ args: tuple = (),
248
+ kwargs: dict[str, Any] | None = None,
249
+ ) -> list[_R]:
250
+ raise NotImplementedError
251
+
252
+
253
+ class InprocClient(EngineCoreClient):
254
+ """
255
+ InprocClient: client for in-process EngineCore. Intended
256
+ for use in LLMEngine for V0-style add_request() and step()
257
+ EngineCore setup in this process (no busy loop).
258
+
259
+ * pushes EngineCoreRequest directly into the EngineCore
260
+ * pulls EngineCoreOutputs by stepping the EngineCore
261
+ """
262
+
263
+ def __init__(self, *args, **kwargs):
264
+ self.engine_core = EngineCore(*args, **kwargs)
265
+
266
+ def get_output(self) -> EngineCoreOutputs:
267
+ outputs, _ = self.engine_core.step_fn()
268
+ return outputs and outputs.get(0) or EngineCoreOutputs()
269
+
270
+ def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
271
+ return self.engine_core.get_supported_tasks()
272
+
273
+ def add_request(self, request: EngineCoreRequest) -> None:
274
+ req, request_wave = self.engine_core.preprocess_add_request(request)
275
+ self.engine_core.add_request(req, request_wave)
276
+
277
+ def abort_requests(self, request_ids: list[str]) -> None:
278
+ if len(request_ids) > 0:
279
+ self.engine_core.abort_requests(request_ids)
280
+
281
+ def shutdown(self) -> None:
282
+ self.engine_core.shutdown()
283
+
284
+ def profile(self, is_start: bool = True) -> None:
285
+ self.engine_core.profile(is_start)
286
+
287
+ def reset_mm_cache(self) -> None:
288
+ self.engine_core.reset_mm_cache()
289
+
290
+ def reset_prefix_cache(self) -> None:
291
+ self.engine_core.reset_prefix_cache()
292
+
293
+ def sleep(self, level: int = 1) -> None:
294
+ self.engine_core.sleep(level)
295
+
296
+ def wake_up(self, tags: list[str] | None = None) -> None:
297
+ self.engine_core.wake_up(tags)
298
+
299
+ def is_sleeping(self) -> bool:
300
+ return self.engine_core.is_sleeping()
301
+
302
+ def execute_dummy_batch(self) -> None:
303
+ self.engine_core.execute_dummy_batch()
304
+
305
+ def add_lora(self, lora_request: LoRARequest) -> bool:
306
+ return self.engine_core.add_lora(lora_request)
307
+
308
+ def remove_lora(self, lora_id: int) -> bool:
309
+ return self.engine_core.remove_lora(lora_id)
310
+
311
+ def list_loras(self) -> set[int]:
312
+ return self.engine_core.list_loras()
313
+
314
+ def pin_lora(self, lora_id: int) -> bool:
315
+ return self.engine_core.pin_lora(lora_id)
316
+
317
+ def save_sharded_state(
318
+ self, path: str, pattern: str | None = None, max_size: int | None = None
319
+ ) -> None:
320
+ self.engine_core.save_sharded_state(path, pattern, max_size)
321
+
322
+ def collective_rpc(
323
+ self,
324
+ method: str | Callable[..., _R],
325
+ timeout: float | None = None,
326
+ args: tuple = (),
327
+ kwargs: dict[str, Any] | None = None,
328
+ ) -> list[_R]:
329
+ return self.engine_core.collective_rpc(method, timeout, args, kwargs)
330
+
331
+ def dp_engines_running(self) -> bool:
332
+ return False
333
+
334
+
335
+ @dataclass
336
+ class BackgroundResources:
337
+ """Used as a finalizer for clean shutdown, avoiding
338
+ circular reference back to the client object."""
339
+
340
+ ctx: zmq.Context
341
+ # If CoreEngineProcManager, it manages local engines;
342
+ # if CoreEngineActorManager, it manages all engines.
343
+ engine_manager: CoreEngineProcManager | CoreEngineActorManager | None = None
344
+ coordinator: DPCoordinator | None = None
345
+ output_socket: zmq.Socket | zmq.asyncio.Socket | None = None
346
+ input_socket: zmq.Socket | zmq.asyncio.Socket | None = None
347
+ first_req_send_socket: zmq.asyncio.Socket | None = None
348
+ first_req_rcv_socket: zmq.asyncio.Socket | None = None
349
+ stats_update_socket: zmq.asyncio.Socket | None = None
350
+ output_queue_task: asyncio.Task | None = None
351
+ stats_update_task: asyncio.Task | None = None
352
+ shutdown_path: str | None = None
353
+
354
+ # Set if any of the engines are dead. Here so that the output
355
+ # processing threads can access it without holding a ref to the client.
356
+ engine_dead: bool = False
357
+
358
+ def __call__(self):
359
+ """Clean up background resources."""
360
+
361
+ self.engine_dead = True
362
+ if self.engine_manager is not None:
363
+ self.engine_manager.close()
364
+ if self.coordinator is not None:
365
+ self.coordinator.close()
366
+
367
+ if isinstance(self.output_socket, zmq.asyncio.Socket):
368
+ # Async case.
369
+ loop = self.output_queue_task._loop if self.output_queue_task else None
370
+
371
+ sockets = (
372
+ self.output_socket,
373
+ self.input_socket,
374
+ self.first_req_send_socket,
375
+ self.first_req_rcv_socket,
376
+ self.stats_update_socket,
377
+ )
378
+
379
+ tasks = (self.output_queue_task, self.stats_update_task)
380
+
381
+ def close_sockets_and_tasks():
382
+ close_sockets(sockets)
383
+ for task in tasks:
384
+ if task is not None and not task.done():
385
+ with contextlib.suppress(Exception):
386
+ task.cancel()
387
+
388
+ if loop is not None:
389
+ if in_loop(loop):
390
+ close_sockets_and_tasks()
391
+ elif not loop.is_closed():
392
+ loop.call_soon_threadsafe(close_sockets_and_tasks)
393
+ else:
394
+ # Loop has been closed, try to clean up directly.
395
+ del tasks
396
+ del close_sockets_and_tasks
397
+ close_sockets(sockets)
398
+ del self.output_queue_task
399
+ del self.stats_update_task
400
+ else:
401
+ # Sync case.
402
+
403
+ # ZMQ context termination can hang if the sockets
404
+ # aren't explicitly closed first.
405
+ close_sockets((self.output_socket, self.input_socket))
406
+
407
+ if self.shutdown_path is not None:
408
+ # We must ensure that the sync output socket is
409
+ # closed cleanly in its own thread.
410
+ with self.ctx.socket(zmq.PAIR) as shutdown_sender:
411
+ shutdown_sender.connect(self.shutdown_path)
412
+ # Send shutdown signal.
413
+ shutdown_sender.send(b"")
414
+
415
+ def validate_alive(self, frames: Sequence[zmq.Frame]):
416
+ if len(frames) == 1 and (frames[0].buffer == EngineCoreProc.ENGINE_CORE_DEAD):
417
+ self.engine_dead = True
418
+ raise EngineDeadError()
419
+
420
+
421
+ class MPClient(EngineCoreClient):
422
+ """
423
+ MPClient: base client for multi-proc EngineCore.
424
+ EngineCore runs in a background process busy loop, getting
425
+ new EngineCoreRequests and returning EngineCoreOutputs
426
+
427
+ * pushes EngineCoreRequests via input_socket
428
+ * pulls EngineCoreOutputs via output_socket
429
+
430
+ * AsyncMPClient subclass for AsyncLLM usage
431
+ * SyncMPClient subclass for LLM usage
432
+ """
433
+
434
+ def __init__(
435
+ self,
436
+ asyncio_mode: bool,
437
+ vllm_config: VllmConfig,
438
+ executor_class: type[Executor],
439
+ log_stats: bool,
440
+ client_addresses: dict[str, str] | None = None,
441
+ ):
442
+ self.vllm_config = vllm_config
443
+ # Serialization setup.
444
+ self.encoder = MsgpackEncoder()
445
+ self.decoder = MsgpackDecoder(EngineCoreOutputs)
446
+
447
+ # ZMQ setup.
448
+ sync_ctx = zmq.Context(io_threads=2)
449
+ self.ctx = zmq.asyncio.Context(sync_ctx) if asyncio_mode else sync_ctx
450
+
451
+ # This will ensure resources created so far are closed
452
+ # when the client is garbage collected, even if an
453
+ # exception is raised mid-construction.
454
+ self.resources = BackgroundResources(ctx=sync_ctx)
455
+ self._finalizer = weakref.finalize(self, self.resources)
456
+ success = False
457
+ try:
458
+ # State used for data parallel.
459
+ self.engines_running = False
460
+
461
+ self.stats_update_address: str | None = None
462
+ if client_addresses:
463
+ # Engines are managed externally to this client.
464
+ input_address = client_addresses["input_address"]
465
+ output_address = client_addresses["output_address"]
466
+ self.stats_update_address = client_addresses.get("stats_update_address")
467
+ else:
468
+ # Engines are managed by this client.
469
+ with launch_core_engines(vllm_config, executor_class, log_stats) as (
470
+ engine_manager,
471
+ coordinator,
472
+ addresses,
473
+ ):
474
+ self.resources.coordinator = coordinator
475
+ self.resources.engine_manager = engine_manager
476
+
477
+ (input_address,) = addresses.inputs
478
+ (output_address,) = addresses.outputs
479
+ self.stats_update_address = addresses.frontend_stats_publish_address
480
+ if coordinator is not None:
481
+ assert self.stats_update_address == (
482
+ coordinator.get_stats_publish_address()
483
+ )
484
+
485
+ # Create input and output sockets.
486
+ self.input_socket = self.resources.input_socket = make_zmq_socket(
487
+ self.ctx, input_address, zmq.ROUTER, bind=True
488
+ )
489
+ self.resources.output_socket = make_zmq_socket(
490
+ self.ctx, output_address, zmq.PULL
491
+ )
492
+
493
+ parallel_config = vllm_config.parallel_config
494
+ dp_size = parallel_config.data_parallel_size
495
+ dp_rank = parallel_config.data_parallel_rank
496
+ dp_local_size = parallel_config.data_parallel_size_local
497
+ offline_mode = parallel_config.data_parallel_rank_local is not None
498
+ # Client manages local+remote EngineCores in pure internal LB case.
499
+ # Client manages local EngineCores in hybrid and external LB case.
500
+ local_engines_only = (
501
+ parallel_config.data_parallel_hybrid_lb
502
+ or parallel_config.data_parallel_external_lb
503
+ )
504
+
505
+ num_ranks = dp_local_size if local_engines_only else dp_size
506
+ self.engine_ranks_managed = (
507
+ [dp_rank] if offline_mode else list(range(dp_rank, dp_rank + num_ranks))
508
+ )
509
+ assert parallel_config.data_parallel_size_local <= len(
510
+ self.engine_ranks_managed
511
+ )
512
+
513
+ # ZMQ identity of each engine that this client will talk to.
514
+ self.core_engines: list[EngineIdentity] = [
515
+ rank.to_bytes(2, "little") for rank in self.engine_ranks_managed
516
+ ]
517
+
518
+ # Wait for ready messages from each engine on the input socket.
519
+ identities = set(self.core_engines)
520
+ sync_input_socket = zmq.Socket.shadow(self.input_socket)
521
+ while identities:
522
+ if not sync_input_socket.poll(timeout=600_000):
523
+ raise TimeoutError(
524
+ "Timed out waiting for engines to send"
525
+ "initial message on input socket."
526
+ )
527
+ identity, _ = sync_input_socket.recv_multipart()
528
+ identities.remove(identity)
529
+
530
+ self.core_engine: EngineIdentity = self.core_engines[0]
531
+ self.utility_results: dict[int, AnyFuture] = {}
532
+
533
+ # Request objects which may contain pytorch-allocated tensors
534
+ # that we need to keep references to until zmq is done with the
535
+ # underlying data.
536
+ self.pending_messages = deque[tuple[zmq.MessageTracker, Any]]()
537
+
538
+ # Start monitoring engine core processes for unexpected failures
539
+ self.start_engine_core_monitor()
540
+
541
+ success = True
542
+ finally:
543
+ if not success:
544
+ self._finalizer()
545
+
546
+ def shutdown(self):
547
+ # Terminate background resources.
548
+ self._finalizer()
549
+
550
+ def _format_exception(self, e: Exception) -> Exception:
551
+ """If errored, use EngineDeadError so root cause is clear."""
552
+ return (
553
+ EngineDeadError(suppress_context=True) if self.resources.engine_dead else e
554
+ )
555
+
556
+ def ensure_alive(self):
557
+ if self.resources.engine_dead:
558
+ raise EngineDeadError()
559
+
560
+ def add_pending_message(self, tracker: zmq.MessageTracker, msg: Any):
561
+ if not tracker.done:
562
+ self.pending_messages.appendleft((tracker, msg))
563
+
564
+ def free_pending_messages(self):
565
+ while self.pending_messages and self.pending_messages[-1][0].done:
566
+ self.pending_messages.pop()
567
+
568
+ def dp_engines_running(self) -> bool:
569
+ return self.engines_running
570
+
571
+ def start_engine_core_monitor(self):
572
+ """Start a monitor thread for engine core processes."""
573
+ engine_manager = self.resources.engine_manager
574
+ if (
575
+ engine_manager is None
576
+ or not hasattr(engine_manager, "processes")
577
+ or not engine_manager.processes
578
+ ):
579
+ # No engine processes to monitor
580
+ return
581
+
582
+ engine_processes = engine_manager.processes
583
+ self_ref = weakref.ref(self)
584
+
585
+ # Monitor engine core process liveness. If any die unexpectedly,
586
+ # logs an error, shuts down the client and invokes the failure
587
+ # callback to inform the engine.
588
+ def monitor_engine_cores():
589
+ sentinels = [proc.sentinel for proc in engine_processes]
590
+ died = multiprocessing.connection.wait(sentinels)
591
+ _self = self_ref()
592
+ if not _self or _self.resources.engine_dead:
593
+ return
594
+ _self.resources.engine_dead = True
595
+ proc_name = next(
596
+ proc.name for proc in engine_processes if proc.sentinel == died[0]
597
+ )
598
+ logger.error(
599
+ "Engine core proc %s died unexpectedly, shutting down client.",
600
+ proc_name,
601
+ )
602
+ _self.shutdown()
603
+ # Note: For MPClient, we don't have a failure callback mechanism
604
+ # like MultiprocExecutor, but we set engine_dead flag which will
605
+ # cause subsequent operations to raise EngineDeadError
606
+
607
+ Thread(
608
+ target=monitor_engine_cores, daemon=True, name="MPClientEngineMonitor"
609
+ ).start()
610
+
611
+
612
+ def _process_utility_output(
613
+ output: UtilityOutput, utility_results: dict[int, AnyFuture]
614
+ ):
615
+ """Set the result from a utility method in the waiting future."""
616
+ future = utility_results.pop(output.call_id)
617
+ failure_message = output.failure_message
618
+ try:
619
+ if failure_message is not None:
620
+ future.set_exception(Exception(failure_message))
621
+ else:
622
+ assert output.result is not None
623
+ future.set_result(output.result.result)
624
+ except asyncio.InvalidStateError:
625
+ # This can happen if the future is cancelled due to the
626
+ # original calling task being cancelled.
627
+ if failure_message is not None:
628
+ logger.error(
629
+ "Cancelled call to utility method failed with error: %s",
630
+ failure_message,
631
+ )
632
+
633
+
634
+ class SyncMPClient(MPClient):
635
+ """Synchronous client for multi-proc EngineCore."""
636
+
637
+ def __init__(
638
+ self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool
639
+ ):
640
+ super().__init__(
641
+ asyncio_mode=False,
642
+ vllm_config=vllm_config,
643
+ executor_class=executor_class,
644
+ log_stats=log_stats,
645
+ )
646
+
647
+ self.is_dp = self.vllm_config.parallel_config.data_parallel_size > 1
648
+ self.outputs_queue = queue.Queue[EngineCoreOutputs | Exception]()
649
+
650
+ # Ensure that the outputs socket processing thread does not have
651
+ # a ref to the client which prevents gc.
652
+ ctx = self.ctx
653
+ out_socket = self.resources.output_socket
654
+ decoder = self.decoder
655
+ utility_results = self.utility_results
656
+ outputs_queue = self.outputs_queue
657
+
658
+ shutdown_path = get_open_zmq_inproc_path()
659
+ resources = self.resources
660
+ resources.shutdown_path = shutdown_path
661
+
662
+ def process_outputs_socket():
663
+ assert isinstance(out_socket, zmq.Socket)
664
+ shutdown_socket = ctx.socket(zmq.PAIR)
665
+ try:
666
+ shutdown_socket.bind(shutdown_path)
667
+ poller = zmq.Poller()
668
+ poller.register(shutdown_socket, zmq.POLLIN)
669
+ poller.register(out_socket, zmq.POLLIN)
670
+ while True:
671
+ socks = poller.poll()
672
+ if not socks:
673
+ continue
674
+ if len(socks) == 2 or socks[0][0] == shutdown_socket:
675
+ # shutdown signal, exit thread.
676
+ break
677
+
678
+ frames = out_socket.recv_multipart(copy=False)
679
+ resources.validate_alive(frames)
680
+ outputs: EngineCoreOutputs = decoder.decode(frames)
681
+ if outputs.utility_output:
682
+ _process_utility_output(outputs.utility_output, utility_results)
683
+ else:
684
+ outputs_queue.put_nowait(outputs)
685
+ except Exception as e:
686
+ outputs_queue.put_nowait(e)
687
+ finally:
688
+ # Close sockets.
689
+ shutdown_socket.close(linger=0)
690
+ out_socket.close(linger=0)
691
+
692
+ # Process outputs from engine in separate thread.
693
+ self.output_queue_thread = Thread(
694
+ target=process_outputs_socket,
695
+ name="EngineCoreOutputQueueThread",
696
+ daemon=True,
697
+ )
698
+ self.output_queue_thread.start()
699
+
700
+ # The thread takes on responsibility for closing the socket.
701
+ self.resources.output_socket = None
702
+
703
+ def get_output(self) -> EngineCoreOutputs:
704
+ # If an exception arises in process_outputs_socket task,
705
+ # it is forwarded to the outputs_queue so we can raise it
706
+ # from this (run_output_handler) task to shut down the server.
707
+ outputs = self.outputs_queue.get()
708
+ if isinstance(outputs, Exception):
709
+ raise self._format_exception(outputs) from None
710
+ if outputs.wave_complete is not None:
711
+ self.engines_running = False
712
+ return outputs
713
+
714
+ def _send_input(self, request_type: EngineCoreRequestType, request: Any):
715
+ self.ensure_alive()
716
+ self.free_pending_messages()
717
+ # (Identity, RequestType, SerializedRequest)
718
+ msg = (self.core_engine, request_type.value, *self.encoder.encode(request))
719
+
720
+ if len(msg) <= 3:
721
+ # No auxiliary buffers => no tensor backing buffers in request.
722
+ self.input_socket.send_multipart(msg, copy=False)
723
+ return
724
+
725
+ tracker = self.input_socket.send_multipart(msg, copy=False, track=True)
726
+ self.add_pending_message(tracker, request)
727
+
728
+ def call_utility(self, method: str, *args) -> Any:
729
+ call_id = uuid.uuid1().int >> 64
730
+ future: Future[Any] = Future()
731
+ self.utility_results[call_id] = future
732
+ self._send_input(EngineCoreRequestType.UTILITY, (0, call_id, method, args))
733
+
734
+ return future.result()
735
+
736
+ def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
737
+ return self.call_utility("get_supported_tasks")
738
+
739
+ def add_request(self, request: EngineCoreRequest) -> None:
740
+ if self.is_dp:
741
+ self.engines_running = True
742
+ self._send_input(EngineCoreRequestType.ADD, request)
743
+
744
+ def abort_requests(self, request_ids: list[str]) -> None:
745
+ if request_ids and not self.resources.engine_dead:
746
+ self._send_input(EngineCoreRequestType.ABORT, request_ids)
747
+
748
+ def profile(self, is_start: bool = True) -> None:
749
+ self.call_utility("profile", is_start)
750
+
751
+ def reset_mm_cache(self) -> None:
752
+ self.call_utility("reset_mm_cache")
753
+
754
+ def reset_prefix_cache(self) -> None:
755
+ self.call_utility("reset_prefix_cache")
756
+
757
+ def add_lora(self, lora_request: LoRARequest) -> bool:
758
+ return self.call_utility("add_lora", lora_request)
759
+
760
+ def remove_lora(self, lora_id: int) -> bool:
761
+ return self.call_utility("remove_lora", lora_id)
762
+
763
+ def list_loras(self) -> set[int]:
764
+ return self.call_utility("list_loras")
765
+
766
+ def pin_lora(self, lora_id: int) -> bool:
767
+ return self.call_utility("pin_lora", lora_id)
768
+
769
+ def sleep(self, level: int = 1) -> None:
770
+ self.call_utility("sleep", level)
771
+
772
+ def wake_up(self, tags: list[str] | None = None) -> None:
773
+ self.call_utility("wake_up", tags)
774
+
775
+ def is_sleeping(self) -> bool:
776
+ return self.call_utility("is_sleeping")
777
+
778
+ def execute_dummy_batch(self) -> None:
779
+ self.call_utility("execute_dummy_batch")
780
+
781
+ def collective_rpc(
782
+ self,
783
+ method: str | Callable[..., _R],
784
+ timeout: float | None = None,
785
+ args: tuple = (),
786
+ kwargs: dict[str, Any] | None = None,
787
+ ) -> list[_R]:
788
+ return self.call_utility("collective_rpc", method, timeout, args, kwargs)
789
+
790
+ def save_sharded_state(
791
+ self, path: str, pattern: str | None = None, max_size: int | None = None
792
+ ) -> None:
793
+ self.call_utility("save_sharded_state", path, pattern, max_size)
794
+
795
+
796
+ class AsyncMPClient(MPClient):
797
+ """Asyncio-compatible client for multi-proc EngineCore."""
798
+
799
+ def __init__(
800
+ self,
801
+ vllm_config: VllmConfig,
802
+ executor_class: type[Executor],
803
+ log_stats: bool,
804
+ client_addresses: dict[str, str] | None = None,
805
+ client_count: int = 1,
806
+ client_index: int = 0,
807
+ ):
808
+ super().__init__(
809
+ asyncio_mode=True,
810
+ vllm_config=vllm_config,
811
+ executor_class=executor_class,
812
+ log_stats=log_stats,
813
+ client_addresses=client_addresses,
814
+ )
815
+
816
+ self.client_count = client_count
817
+ self.client_index = client_index
818
+ self.outputs_queue = asyncio.Queue[EngineCoreOutputs | Exception]()
819
+ try:
820
+ # If we are running in an asyncio event loop, start the queue task.
821
+ # Otherwise, it will be started lazily. If it is not started here,
822
+ # we could miss EXECUTOR_FAILED messages from engine core if they
823
+ # occur prior to any requests being sent.
824
+ asyncio.get_running_loop()
825
+ self._ensure_output_queue_task()
826
+ except RuntimeError:
827
+ pass
828
+
829
+ def _ensure_output_queue_task(self):
830
+ resources = self.resources
831
+ if resources.output_queue_task is not None:
832
+ return
833
+
834
+ # Perform IO in separate task to parallelize as much as possible.
835
+ # Avoid task having direct reference back to the client.
836
+ decoder = self.decoder
837
+ utility_results = self.utility_results
838
+ outputs_queue = self.outputs_queue
839
+ output_handler: (
840
+ Callable[[AsyncMPClient, EngineCoreOutputs], Awaitable[None]] | None
841
+ ) = getattr(self.__class__, "process_engine_outputs", None)
842
+ _self_ref = weakref.ref(self) if output_handler else None
843
+ output_socket = resources.output_socket
844
+ assert output_socket is not None
845
+
846
+ async def process_outputs_socket():
847
+ try:
848
+ while True:
849
+ frames = await output_socket.recv_multipart(copy=False)
850
+ resources.validate_alive(frames)
851
+ outputs: EngineCoreOutputs = decoder.decode(frames)
852
+ if outputs.utility_output:
853
+ _process_utility_output(outputs.utility_output, utility_results)
854
+ continue
855
+
856
+ if output_handler is not None:
857
+ assert _self_ref is not None
858
+ _self = _self_ref()
859
+ if not _self:
860
+ # Client has been garbage collected, abort.
861
+ return
862
+ await output_handler(_self, outputs)
863
+
864
+ if outputs.outputs or outputs.scheduler_stats:
865
+ outputs_queue.put_nowait(outputs)
866
+ except Exception as e:
867
+ outputs_queue.put_nowait(e)
868
+ except asyncio.CancelledError:
869
+ outputs_queue.put_nowait(EngineDeadError())
870
+
871
+ resources.output_queue_task = asyncio.create_task(
872
+ process_outputs_socket(), name="EngineCoreOutputQueueTask"
873
+ )
874
+
875
+ async def get_output_async(self) -> EngineCoreOutputs:
876
+ self._ensure_output_queue_task()
877
+ # If an exception arises in process_outputs_socket task,
878
+ # it is forwarded to the outputs_queue so we can raise it
879
+ # from this (run_output_handler) task to shut down the server.
880
+ assert self.outputs_queue is not None
881
+ outputs = await self.outputs_queue.get()
882
+ if isinstance(outputs, Exception):
883
+ raise self._format_exception(outputs) from None
884
+ return outputs
885
+
886
+ def _send_input(
887
+ self,
888
+ request_type: EngineCoreRequestType,
889
+ request: Any,
890
+ engine: EngineIdentity | None = None,
891
+ ) -> Awaitable[Any]:
892
+ if engine is None:
893
+ engine = self.core_engine
894
+
895
+ message = (request_type.value, *self.encoder.encode(request))
896
+ return self._send_input_message(message, engine, request)
897
+
898
+ def _send_input_message(
899
+ self, message: tuple[bytestr, ...], engine: EngineIdentity, objects: Any
900
+ ) -> Awaitable[Any]:
901
+ """
902
+ objects is a reference to retain until zmq is finished with the
903
+ buffers, in case they were extracted from tensors in the request.
904
+ """
905
+ self.ensure_alive()
906
+ self.free_pending_messages()
907
+
908
+ msg = (engine,) + message
909
+ if not objects or len(msg) <= 3:
910
+ # No auxiliary buffers => no tensor backing buffers in request.
911
+ return self.input_socket.send_multipart(msg, copy=False)
912
+
913
+ future: asyncio.Future[zmq.MessageTracker]
914
+ future = self.input_socket.send_multipart(msg, copy=False, track=True)
915
+
916
+ def add_pending(f: asyncio.Future[zmq.MessageTracker]):
917
+ with contextlib.suppress(BaseException):
918
+ self.add_pending_message(f.result(), objects)
919
+
920
+ future.add_done_callback(add_pending)
921
+ return future
922
+
923
+ async def call_utility_async(self, method: str, *args) -> Any:
924
+ return await self._call_utility_async(method, *args, engine=self.core_engine)
925
+
926
+ async def _call_utility_async(
927
+ self, method: str, *args, engine: EngineIdentity
928
+ ) -> Any:
929
+ call_id = uuid.uuid1().int >> 64
930
+ future = asyncio.get_running_loop().create_future()
931
+ self.utility_results[call_id] = future
932
+ message = (
933
+ EngineCoreRequestType.UTILITY.value,
934
+ *self.encoder.encode((self.client_index, call_id, method, args)),
935
+ )
936
+ await self._send_input_message(message, engine, args)
937
+ self._ensure_output_queue_task()
938
+ return await future
939
+
940
+ async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
941
+ return await self.call_utility_async("get_supported_tasks")
942
+
943
+ async def add_request_async(self, request: EngineCoreRequest) -> None:
944
+ request.client_index = self.client_index
945
+ await self._send_input(EngineCoreRequestType.ADD, request)
946
+ self._ensure_output_queue_task()
947
+
948
+ async def abort_requests_async(self, request_ids: list[str]) -> None:
949
+ if request_ids and not self.resources.engine_dead:
950
+ await self._send_input(EngineCoreRequestType.ABORT, request_ids)
951
+
952
+ async def profile_async(self, is_start: bool = True) -> None:
953
+ await self.call_utility_async("profile", is_start)
954
+
955
+ async def reset_mm_cache_async(self) -> None:
956
+ await self.call_utility_async("reset_mm_cache")
957
+
958
+ async def reset_prefix_cache_async(self) -> None:
959
+ await self.call_utility_async("reset_prefix_cache")
960
+
961
+ async def sleep_async(self, level: int = 1) -> None:
962
+ await self.call_utility_async("sleep", level)
963
+
964
+ async def wake_up_async(self, tags: list[str] | None = None) -> None:
965
+ await self.call_utility_async("wake_up", tags)
966
+
967
+ async def is_sleeping_async(self) -> bool:
968
+ return await self.call_utility_async("is_sleeping")
969
+
970
+ async def execute_dummy_batch_async(self) -> None:
971
+ await self.call_utility_async("execute_dummy_batch")
972
+
973
+ async def add_lora_async(self, lora_request: LoRARequest) -> bool:
974
+ return await self.call_utility_async("add_lora", lora_request)
975
+
976
+ async def remove_lora_async(self, lora_id: int) -> bool:
977
+ return await self.call_utility_async("remove_lora", lora_id)
978
+
979
+ async def list_loras_async(self) -> set[int]:
980
+ return await self.call_utility_async("list_loras")
981
+
982
+ async def pin_lora_async(self, lora_id: int) -> bool:
983
+ return await self.call_utility_async("pin_lora", lora_id)
984
+
985
+ async def save_sharded_state_async(
986
+ self, path: str, pattern: str | None = None, max_size: int | None = None
987
+ ) -> None:
988
+ await self.call_utility_async("save_sharded_state", path, pattern, max_size)
989
+
990
+ async def collective_rpc_async(
991
+ self,
992
+ method: str | Callable[..., _R],
993
+ timeout: float | None = None,
994
+ args: tuple = (),
995
+ kwargs: dict[str, Any] | None = None,
996
+ ) -> list[_R]:
997
+ return await self.call_utility_async(
998
+ "collective_rpc", method, timeout, args, kwargs
999
+ )
1000
+
1001
+
1002
+ class DPAsyncMPClient(AsyncMPClient):
1003
+ """Asyncio-compatible client for multi-proc, multi-engine (data parallel)
1004
+ EngineCore. Assumes external load-balancing by default."""
1005
+
1006
+ def __init__(
1007
+ self,
1008
+ vllm_config: VllmConfig,
1009
+ executor_class: type[Executor],
1010
+ log_stats: bool,
1011
+ client_addresses: dict[str, str] | None = None,
1012
+ client_count: int = 1,
1013
+ client_index: int = 0,
1014
+ ):
1015
+ self.current_wave = 0
1016
+
1017
+ super().__init__(
1018
+ vllm_config,
1019
+ executor_class,
1020
+ log_stats,
1021
+ client_addresses,
1022
+ client_count,
1023
+ client_index,
1024
+ )
1025
+
1026
+ # List of [waiting, running] pair per engine.
1027
+ # Used only by DPLBAsyncMPClient subclass.
1028
+ self.lb_engines: list[list[int]] = [[0, 0] for _ in self.core_engines]
1029
+
1030
+ self.first_req_sock_addr = get_open_zmq_inproc_path()
1031
+ self.first_req_send_socket = self.resources.first_req_send_socket = (
1032
+ make_zmq_socket(self.ctx, self.first_req_sock_addr, zmq.PAIR, bind=True)
1033
+ )
1034
+ try:
1035
+ # If we are running in an asyncio event loop, start the stats task.
1036
+ # Otherwise, it will be started lazily.
1037
+ asyncio.get_running_loop()
1038
+ self._ensure_stats_update_task()
1039
+ except RuntimeError:
1040
+ pass
1041
+
1042
+ def _ensure_stats_update_task(self):
1043
+ resources = self.resources
1044
+ if resources.stats_update_task is not None:
1045
+ return
1046
+
1047
+ assert self.stats_update_address is not None
1048
+ stats_addr: str = self.stats_update_address
1049
+ assert len(self.engine_ranks_managed) > 0
1050
+ # NOTE: running and waiting counts are all global from
1051
+ # the Coordinator include all global EngineCores. This
1052
+ # slice includes just the cores managed by this client.
1053
+ count_slice = slice(
1054
+ self.engine_ranks_managed[0], self.engine_ranks_managed[-1] + 1
1055
+ )
1056
+
1057
+ async def run_engine_stats_update_task():
1058
+ with (
1059
+ make_zmq_socket(self.ctx, stats_addr, zmq.XSUB, linger=0) as socket,
1060
+ make_zmq_socket(
1061
+ self.ctx, self.first_req_sock_addr, zmq.PAIR, bind=False, linger=0
1062
+ ) as first_req_rcv_socket,
1063
+ ):
1064
+ assert isinstance(socket, zmq.asyncio.Socket)
1065
+ assert isinstance(first_req_rcv_socket, zmq.asyncio.Socket)
1066
+ self.resources.stats_update_socket = socket
1067
+ self.resources.first_req_rcv_socket = first_req_rcv_socket
1068
+ # Send subscription message.
1069
+ await socket.send(b"\x01")
1070
+
1071
+ poller = zmq.asyncio.Poller()
1072
+ poller.register(socket, zmq.POLLIN)
1073
+ poller.register(first_req_rcv_socket, zmq.POLLIN)
1074
+
1075
+ while True:
1076
+ events = await poller.poll()
1077
+ if (
1078
+ not self.engines_running
1079
+ and len(events) == 2
1080
+ or (events[0][0] == first_req_rcv_socket)
1081
+ ):
1082
+ # Check if this is a regular request notification or
1083
+ # scale up notification
1084
+ buf = first_req_rcv_socket.recv(flags=zmq.NOBLOCK).result()
1085
+
1086
+ decoded = msgspec.msgpack.decode(buf)
1087
+ if (
1088
+ isinstance(decoded, (list, tuple))
1089
+ and len(decoded) == 2
1090
+ and decoded[0] == "SCALE_ELASTIC_EP"
1091
+ ):
1092
+ # Extract new engine count from the decoded message
1093
+ new_engine_count = decoded[1]
1094
+ # Send scale up notification to coordinator
1095
+ scale_msg = msgspec.msgpack.encode(
1096
+ ("SCALE_ELASTIC_EP", new_engine_count)
1097
+ )
1098
+ await socket.send(scale_msg)
1099
+ continue
1100
+
1101
+ # we're sending a request while the engines are
1102
+ # paused, so that it can wake the others up
1103
+ # (to run dummy EP loop).
1104
+ assert decoded[0] == "FIRST_REQ"
1105
+ target_eng_index = decoded[1]
1106
+ self.engines_running = True
1107
+ msg = msgspec.msgpack.encode(
1108
+ (target_eng_index, self.current_wave)
1109
+ )
1110
+ await socket.send(msg)
1111
+
1112
+ buf = None
1113
+ while True:
1114
+ # Drain all stats events (we only care about latest).
1115
+ future: asyncio.Future[bytes] = socket.recv(flags=zmq.NOBLOCK)
1116
+ if isinstance(future.exception(), zmq.Again):
1117
+ break
1118
+ buf = future.result()
1119
+ if buf is None:
1120
+ continue
1121
+
1122
+ # Update local load-balancing state.
1123
+ counts, wave, running = msgspec.msgpack.decode(buf)
1124
+ self.current_wave = wave
1125
+ self.engines_running = running
1126
+ if counts is not None:
1127
+ sliced_counts = counts[count_slice]
1128
+ self.lb_engines = sliced_counts
1129
+ logger.debug(
1130
+ "Received counts: %s (%s)", sliced_counts, count_slice
1131
+ )
1132
+
1133
+ resources.stats_update_task = asyncio.create_task(
1134
+ run_engine_stats_update_task()
1135
+ )
1136
+
1137
+ async def add_request_async(self, request: EngineCoreRequest) -> None:
1138
+ self._ensure_stats_update_task()
1139
+
1140
+ request.current_wave = self.current_wave
1141
+ request.client_index = self.client_index
1142
+
1143
+ chosen_engine = self.get_core_engine_for_request(request)
1144
+ to_await = self._send_input(EngineCoreRequestType.ADD, request, chosen_engine)
1145
+ if not self.engines_running:
1146
+ # Notify coordinator that we're sending a request
1147
+ req_msg = msgspec.msgpack.encode(("FIRST_REQ", chosen_engine))
1148
+ await self.first_req_send_socket.send(req_msg)
1149
+
1150
+ await to_await
1151
+
1152
+ self._ensure_output_queue_task()
1153
+
1154
+ def get_core_engine_for_request(self, request: EngineCoreRequest):
1155
+ return self.core_engine
1156
+
1157
+
1158
+ class DPLBAsyncMPClient(DPAsyncMPClient):
1159
+ """Asyncio-compatible client for multi-proc, multi-engine (data parallel)
1160
+ EngineCore. Load-balances between multiple engine processes."""
1161
+
1162
+ def __init__(
1163
+ self,
1164
+ vllm_config: VllmConfig,
1165
+ executor_class: type[Executor],
1166
+ log_stats: bool,
1167
+ client_addresses: dict[str, str] | None = None,
1168
+ client_count: int = 1,
1169
+ client_index: int = 0,
1170
+ ):
1171
+ self.client_count = client_count
1172
+
1173
+ # To route aborts to the correct engine.
1174
+ self.reqs_in_flight: dict[str, EngineIdentity] = {}
1175
+
1176
+ super().__init__(
1177
+ vllm_config,
1178
+ executor_class,
1179
+ log_stats,
1180
+ client_addresses,
1181
+ client_count,
1182
+ client_index,
1183
+ )
1184
+
1185
+ assert len(self.core_engines) > 1
1186
+
1187
+ self.eng_start_index = (
1188
+ len(self.core_engines) * self.client_index
1189
+ ) // client_count
1190
+
1191
+ def get_core_engine_for_request(self, request: EngineCoreRequest) -> EngineIdentity:
1192
+ # Engines are in rank order.
1193
+ if (eng_index := request.data_parallel_rank) is None:
1194
+ current_counts = self.lb_engines
1195
+ # TODO use P2C alg for larger DP sizes
1196
+ num_engines = len(current_counts)
1197
+ min_score = sys.maxsize
1198
+ eng_index = 0
1199
+ for i in range(num_engines):
1200
+ # Start from client_index to help with balancing when engines
1201
+ # are empty.
1202
+ idx = (self.eng_start_index + i) % num_engines
1203
+ waiting, running = current_counts[idx]
1204
+ score = waiting * 4 + running
1205
+ if score < min_score:
1206
+ min_score = score
1207
+ eng_index = idx
1208
+ # Increment local waiting count for better balancing between stats
1209
+ # updates from the coordinator (which happen every 100ms).
1210
+ current_counts[eng_index][0] += self.client_count
1211
+
1212
+ chosen_engine = self.core_engines[eng_index]
1213
+ # Record which engine is chosen for this request, to handle aborts.
1214
+ self.reqs_in_flight[request.request_id] = chosen_engine
1215
+ return chosen_engine
1216
+
1217
+ async def call_utility_async(self, method: str, *args) -> Any:
1218
+ # Only the result from the first engine is returned.
1219
+ return (
1220
+ await asyncio.gather(
1221
+ *[
1222
+ self._call_utility_async(method, *args, engine=engine)
1223
+ for engine in self.core_engines
1224
+ ]
1225
+ )
1226
+ )[0]
1227
+
1228
+ @staticmethod
1229
+ async def process_engine_outputs(
1230
+ self: "DPLBAsyncMPClient", outputs: EngineCoreOutputs
1231
+ ):
1232
+ if outputs.finished_requests and self.reqs_in_flight:
1233
+ for req_id in outputs.finished_requests:
1234
+ self.reqs_in_flight.pop(req_id, None)
1235
+
1236
+ async def abort_requests_async(self, request_ids: list[str]) -> None:
1237
+ if not request_ids or self.resources.engine_dead:
1238
+ return
1239
+
1240
+ if len(request_ids) == 1:
1241
+ # Fast-path common case.
1242
+ if engine := self.reqs_in_flight.get(request_ids[0]):
1243
+ await self._abort_requests(request_ids, engine)
1244
+ return
1245
+
1246
+ by_engine = defaultdict[EngineIdentity, list[str]](list)
1247
+ for req_id in request_ids:
1248
+ if engine := self.reqs_in_flight.get(req_id):
1249
+ by_engine[engine].append(req_id)
1250
+ for engine, req_ids in by_engine.items():
1251
+ await self._abort_requests(req_ids, engine)
1252
+
1253
+ async def _abort_requests(
1254
+ self, request_ids: list[str], engine: EngineIdentity
1255
+ ) -> None:
1256
+ await self._send_input(EngineCoreRequestType.ABORT, request_ids, engine)
1257
+
1258
+ async def scale_elastic_ep(self, new_data_parallel_size: int) -> None:
1259
+ """Scale elastic EP data parallel size"""
1260
+ cur_data_parallel_size = len(self.core_engines)
1261
+
1262
+ assert new_data_parallel_size != cur_data_parallel_size, (
1263
+ f"new_data_parallel_size {new_data_parallel_size} must be "
1264
+ f"different from cur_data_parallel_size {cur_data_parallel_size}"
1265
+ )
1266
+
1267
+ assert self.vllm_config.parallel_config.data_parallel_backend == "ray", (
1268
+ "Only ray DP backend supports scaling elastic EP"
1269
+ )
1270
+
1271
+ scale_up = new_data_parallel_size > cur_data_parallel_size
1272
+
1273
+ if scale_up:
1274
+ await self._scale_up_elastic_ep(
1275
+ cur_data_parallel_size, new_data_parallel_size
1276
+ )
1277
+ else:
1278
+ await self._scale_down_elastic_ep(
1279
+ cur_data_parallel_size, new_data_parallel_size
1280
+ )
1281
+
1282
+ async def _scale_up_elastic_ep(
1283
+ self, cur_data_parallel_size: int, new_data_parallel_size: int
1284
+ ) -> None:
1285
+ """Scale up the data parallel size by creating new engine cores
1286
+ and reconfiguring existing ones."""
1287
+ cur_data_parallel_size = len(self.core_engines)
1288
+
1289
+ # Phase 1: Send reconfigure messages to all existing engines and wait
1290
+ # for them to be sent
1291
+ reconfig_futures = []
1292
+ self.vllm_config.parallel_config.data_parallel_master_port = get_open_port()
1293
+ for engine in self.core_engines:
1294
+ reconfig_request = ReconfigureDistributedRequest(
1295
+ new_data_parallel_size=new_data_parallel_size,
1296
+ new_data_parallel_rank=ReconfigureRankType.KEEP_CURRENT_RANK,
1297
+ new_data_parallel_rank_local=ReconfigureRankType.KEEP_CURRENT_RANK,
1298
+ new_data_parallel_master_ip=self.vllm_config.parallel_config.data_parallel_master_ip,
1299
+ new_data_parallel_master_port=self.vllm_config.parallel_config.data_parallel_master_port,
1300
+ )
1301
+ coro = self._call_utility_async(
1302
+ "reinitialize_distributed", reconfig_request, engine=engine
1303
+ )
1304
+ reconfig_futures.append(asyncio.create_task(coro))
1305
+
1306
+ logger.info("All reconfigure messages sent, starting engine creation")
1307
+
1308
+ # Phase 2: Create new engines now that reconfig messages have been sent
1309
+ # self.resources.engine_manager is guaranteed to be
1310
+ # CoreEngineActorManager for RayDPClient
1311
+ assert isinstance(self.resources.engine_manager, CoreEngineActorManager)
1312
+ self.resources.engine_manager.scale_up_elastic_ep(
1313
+ self.vllm_config, new_data_parallel_size
1314
+ )
1315
+
1316
+ # Create new CoreEngine objects for the new engines
1317
+ new_engine_identities = set()
1318
+ for i in range(cur_data_parallel_size, new_data_parallel_size):
1319
+ new_engine = i.to_bytes(2, "little")
1320
+ self.core_engines.append(new_engine)
1321
+ new_engine_identities.add(new_engine)
1322
+
1323
+ # Wait for ready messages from new engines on the input socket
1324
+ sync_input_socket = zmq.Socket.shadow(self.input_socket)
1325
+ while new_engine_identities:
1326
+ if not sync_input_socket.poll(timeout=600_000):
1327
+ raise TimeoutError(
1328
+ "Timed out waiting for new engines to send initial "
1329
+ "message on input socket."
1330
+ )
1331
+ identity, _ = sync_input_socket.recv_multipart()
1332
+ new_engine_identities.discard(identity)
1333
+
1334
+ # Phase 3: Wait for all existing engines to complete reconfiguration
1335
+ logger.info("Waiting for existing engines to complete reconfiguration")
1336
+ await asyncio.gather(*reconfig_futures)
1337
+
1338
+ # Notify coordinator about scale up through existing
1339
+ # stats_update_task connection
1340
+ self._ensure_stats_update_task()
1341
+ scale_up_marker = msgspec.msgpack.encode(
1342
+ ("SCALE_ELASTIC_EP", new_data_parallel_size)
1343
+ )
1344
+ await self.first_req_send_socket.send(scale_up_marker)
1345
+
1346
+ # Update the parallel config
1347
+ self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
1348
+ logger.info(
1349
+ "[Elastic EP] Scale up completed, new data parallel size: %s",
1350
+ new_data_parallel_size,
1351
+ )
1352
+
1353
+ async def _scale_down_elastic_ep(
1354
+ self, cur_data_parallel_size: int, new_data_parallel_size: int
1355
+ ) -> None:
1356
+ """Scale down the data parallel size by shutting down and
1357
+ reconfiguring existing engine cores."""
1358
+ cur_data_parallel_size = len(self.core_engines)
1359
+
1360
+ self.vllm_config.parallel_config.data_parallel_master_port = get_open_port()
1361
+
1362
+ reconfig_futures = []
1363
+ for cur_dp_rank, engine in enumerate(self.core_engines):
1364
+ reconfig_request = ReconfigureDistributedRequest(
1365
+ new_data_parallel_size=new_data_parallel_size,
1366
+ new_data_parallel_rank=ReconfigureRankType.KEEP_CURRENT_RANK,
1367
+ new_data_parallel_rank_local=ReconfigureRankType.KEEP_CURRENT_RANK,
1368
+ new_data_parallel_master_ip=self.vllm_config.parallel_config.data_parallel_master_ip,
1369
+ new_data_parallel_master_port=self.vllm_config.parallel_config.data_parallel_master_port,
1370
+ )
1371
+ if cur_dp_rank >= new_data_parallel_size:
1372
+ reconfig_request.new_data_parallel_rank = (
1373
+ ReconfigureRankType.SHUTDOWN_CURRENT_RANK
1374
+ )
1375
+ coro = self._call_utility_async(
1376
+ "reinitialize_distributed", reconfig_request, engine=engine
1377
+ )
1378
+ reconfig_futures.append(asyncio.create_task(coro))
1379
+
1380
+ for _ in range(new_data_parallel_size, cur_data_parallel_size):
1381
+ self.core_engines.pop()
1382
+
1383
+ await asyncio.gather(*reconfig_futures)
1384
+
1385
+ assert isinstance(self.resources.engine_manager, CoreEngineActorManager)
1386
+ self.resources.engine_manager.scale_down_elastic_ep(
1387
+ cur_data_parallel_size, new_data_parallel_size
1388
+ )
1389
+
1390
+ self._ensure_stats_update_task()
1391
+ scale_down_marker = msgspec.msgpack.encode(
1392
+ ("SCALE_ELASTIC_EP", new_data_parallel_size)
1393
+ )
1394
+ await self.first_req_send_socket.send(scale_down_marker)
1395
+
1396
+ self.vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
1397
+ logger.info(
1398
+ "[Elastic EP] Scale down completed, new data parallel size: %s",
1399
+ new_data_parallel_size,
1400
+ )