vllm-cpu 0.11.0.post2 (vllm_cpu-0.11.0.post2-cp312-cp312-manylinux_2_17_x86_64.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (1398)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2044 -0
  5. vllm/_ipex_ops.py +393 -0
  6. vllm/_version.py +34 -0
  7. vllm/assets/__init__.py +0 -0
  8. vllm/assets/audio.py +45 -0
  9. vllm/assets/base.py +41 -0
  10. vllm/assets/image.py +50 -0
  11. vllm/assets/video.py +145 -0
  12. vllm/attention/__init__.py +15 -0
  13. vllm/attention/backends/__init__.py +0 -0
  14. vllm/attention/backends/abstract.py +204 -0
  15. vllm/attention/backends/utils.py +33 -0
  16. vllm/attention/layer.py +645 -0
  17. vllm/attention/layers/__init__.py +0 -0
  18. vllm/attention/layers/chunked_local_attention.py +93 -0
  19. vllm/attention/layers/cross_attention.py +162 -0
  20. vllm/attention/layers/encoder_only_attention.py +86 -0
  21. vllm/attention/ops/__init__.py +0 -0
  22. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  23. vllm/attention/ops/common.py +345 -0
  24. vllm/attention/ops/flashmla.py +192 -0
  25. vllm/attention/ops/merge_attn_states.py +43 -0
  26. vllm/attention/ops/paged_attn.py +262 -0
  27. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  28. vllm/attention/ops/prefix_prefill.py +928 -0
  29. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  30. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  31. vllm/attention/ops/triton_decode_attention.py +691 -0
  32. vllm/attention/ops/triton_flash_attention.py +984 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +175 -0
  35. vllm/attention/ops/triton_unified_attention.py +894 -0
  36. vllm/attention/selector.py +245 -0
  37. vllm/attention/utils/__init__.py +0 -0
  38. vllm/attention/utils/fa_utils.py +85 -0
  39. vllm/attention/utils/kv_sharing_utils.py +33 -0
  40. vllm/beam_search.py +87 -0
  41. vllm/benchmarks/__init__.py +0 -0
  42. vllm/benchmarks/datasets.py +2723 -0
  43. vllm/benchmarks/latency.py +170 -0
  44. vllm/benchmarks/lib/__init__.py +3 -0
  45. vllm/benchmarks/lib/endpoint_request_func.py +533 -0
  46. vllm/benchmarks/lib/ready_checker.py +73 -0
  47. vllm/benchmarks/lib/utils.py +80 -0
  48. vllm/benchmarks/serve.py +1358 -0
  49. vllm/benchmarks/throughput.py +696 -0
  50. vllm/collect_env.py +823 -0
  51. vllm/compilation/__init__.py +0 -0
  52. vllm/compilation/activation_quant_fusion.py +189 -0
  53. vllm/compilation/backends.py +650 -0
  54. vllm/compilation/base_static_graph.py +56 -0
  55. vllm/compilation/collective_fusion.py +1188 -0
  56. vllm/compilation/compiler_interface.py +573 -0
  57. vllm/compilation/counter.py +47 -0
  58. vllm/compilation/cuda_graph.py +199 -0
  59. vllm/compilation/cuda_piecewise_backend.py +117 -0
  60. vllm/compilation/decorators.py +400 -0
  61. vllm/compilation/fix_functionalization.py +205 -0
  62. vllm/compilation/fusion.py +383 -0
  63. vllm/compilation/fusion_attn.py +295 -0
  64. vllm/compilation/fx_utils.py +84 -0
  65. vllm/compilation/inductor_pass.py +136 -0
  66. vllm/compilation/monitor.py +57 -0
  67. vllm/compilation/noop_elimination.py +158 -0
  68. vllm/compilation/pass_manager.py +125 -0
  69. vllm/compilation/post_cleanup.py +20 -0
  70. vllm/compilation/sequence_parallelism.py +478 -0
  71. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  72. vllm/compilation/vllm_inductor_pass.py +156 -0
  73. vllm/compilation/wrapper.py +136 -0
  74. vllm/config/__init__.py +814 -0
  75. vllm/config/cache.py +220 -0
  76. vllm/config/compilation.py +673 -0
  77. vllm/config/device.py +74 -0
  78. vllm/config/kv_events.py +50 -0
  79. vllm/config/kv_transfer.py +111 -0
  80. vllm/config/load.py +113 -0
  81. vllm/config/lora.py +132 -0
  82. vllm/config/model.py +1912 -0
  83. vllm/config/multimodal.py +129 -0
  84. vllm/config/observability.py +99 -0
  85. vllm/config/parallel.py +524 -0
  86. vllm/config/pooler.py +97 -0
  87. vllm/config/scheduler.py +287 -0
  88. vllm/config/speculative.py +568 -0
  89. vllm/config/speech_to_text.py +39 -0
  90. vllm/config/structured_outputs.py +64 -0
  91. vllm/config/utils.py +145 -0
  92. vllm/connections.py +186 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +311 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +41 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +440 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +317 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +295 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +323 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +28 -0
  106. vllm/distributed/device_communicators/pynccl.py +340 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +186 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +416 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +589 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +635 -0
  113. vllm/distributed/device_communicators/symm_mem.py +136 -0
  114. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  115. vllm/distributed/device_communicators/xpu_communicator.py +94 -0
  116. vllm/distributed/eplb/__init__.py +8 -0
  117. vllm/distributed/eplb/eplb_state.py +620 -0
  118. vllm/distributed/eplb/rebalance_algo.py +239 -0
  119. vllm/distributed/eplb/rebalance_execute.py +424 -0
  120. vllm/distributed/kv_events.py +362 -0
  121. vllm/distributed/kv_transfer/README.md +29 -0
  122. vllm/distributed/kv_transfer/__init__.py +13 -0
  123. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  124. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  125. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  126. vllm/distributed/kv_transfer/kv_connector/factory.py +113 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +261 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +388 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +168 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +100 -0
  132. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +328 -0
  133. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1473 -0
  134. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +485 -0
  135. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +488 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +550 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +267 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +418 -0
  140. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  141. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  142. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  144. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  145. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  146. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  147. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  148. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  149. vllm/distributed/parallel_state.py +1532 -0
  150. vllm/distributed/tpu_distributed_utils.py +178 -0
  151. vllm/distributed/utils.py +536 -0
  152. vllm/engine/__init__.py +0 -0
  153. vllm/engine/arg_utils.py +1778 -0
  154. vllm/engine/async_llm_engine.py +6 -0
  155. vllm/engine/llm_engine.py +6 -0
  156. vllm/engine/metrics.py +577 -0
  157. vllm/engine/metrics_types.py +84 -0
  158. vllm/engine/protocol.py +333 -0
  159. vllm/entrypoints/__init__.py +0 -0
  160. vllm/entrypoints/api_server.py +178 -0
  161. vllm/entrypoints/chat_utils.py +1705 -0
  162. vllm/entrypoints/cli/__init__.py +12 -0
  163. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  164. vllm/entrypoints/cli/benchmark/base.py +25 -0
  165. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  166. vllm/entrypoints/cli/benchmark/main.py +55 -0
  167. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  168. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  169. vllm/entrypoints/cli/collect_env.py +36 -0
  170. vllm/entrypoints/cli/main.py +60 -0
  171. vllm/entrypoints/cli/openai.py +233 -0
  172. vllm/entrypoints/cli/run_batch.py +67 -0
  173. vllm/entrypoints/cli/serve.py +232 -0
  174. vllm/entrypoints/cli/types.py +29 -0
  175. vllm/entrypoints/constants.py +10 -0
  176. vllm/entrypoints/context.py +481 -0
  177. vllm/entrypoints/harmony_utils.py +436 -0
  178. vllm/entrypoints/launcher.py +164 -0
  179. vllm/entrypoints/llm.py +1629 -0
  180. vllm/entrypoints/logger.py +79 -0
  181. vllm/entrypoints/openai/__init__.py +0 -0
  182. vllm/entrypoints/openai/api_server.py +1953 -0
  183. vllm/entrypoints/openai/cli_args.py +288 -0
  184. vllm/entrypoints/openai/logits_processors.py +90 -0
  185. vllm/entrypoints/openai/protocol.py +2757 -0
  186. vllm/entrypoints/openai/run_batch.py +491 -0
  187. vllm/entrypoints/openai/serving_chat.py +1597 -0
  188. vllm/entrypoints/openai/serving_classification.py +173 -0
  189. vllm/entrypoints/openai/serving_completion.py +692 -0
  190. vllm/entrypoints/openai/serving_embedding.py +631 -0
  191. vllm/entrypoints/openai/serving_engine.py +992 -0
  192. vllm/entrypoints/openai/serving_models.py +288 -0
  193. vllm/entrypoints/openai/serving_pooling.py +276 -0
  194. vllm/entrypoints/openai/serving_responses.py +1709 -0
  195. vllm/entrypoints/openai/serving_score.py +479 -0
  196. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  197. vllm/entrypoints/openai/serving_transcription.py +136 -0
  198. vllm/entrypoints/openai/speech_to_text.py +388 -0
  199. vllm/entrypoints/openai/tool_parsers/__init__.py +55 -0
  200. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  201. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  202. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  203. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  204. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  205. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  206. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +455 -0
  207. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  208. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  209. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  210. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  211. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  212. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  213. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +39 -0
  214. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  216. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +93 -0
  217. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  218. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  219. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  220. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1137 -0
  221. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  222. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  223. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  224. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  225. vllm/entrypoints/renderer.py +395 -0
  226. vllm/entrypoints/score_utils.py +232 -0
  227. vllm/entrypoints/ssl.py +75 -0
  228. vllm/entrypoints/tool.py +139 -0
  229. vllm/entrypoints/tool_server.py +206 -0
  230. vllm/entrypoints/utils.py +233 -0
  231. vllm/env_override.py +23 -0
  232. vllm/envs.py +1590 -0
  233. vllm/executor/__init__.py +0 -0
  234. vllm/executor/executor_base.py +381 -0
  235. vllm/executor/msgspec_utils.py +35 -0
  236. vllm/executor/ray_distributed_executor.py +699 -0
  237. vllm/executor/ray_utils.py +410 -0
  238. vllm/executor/uniproc_executor.py +176 -0
  239. vllm/forward_context.py +402 -0
  240. vllm/inputs/__init__.py +30 -0
  241. vllm/inputs/data.py +356 -0
  242. vllm/inputs/parse.py +151 -0
  243. vllm/inputs/preprocess.py +664 -0
  244. vllm/logger.py +229 -0
  245. vllm/logging_utils/__init__.py +10 -0
  246. vllm/logging_utils/dump_input.py +81 -0
  247. vllm/logging_utils/formatter.py +79 -0
  248. vllm/logging_utils/log_time.py +32 -0
  249. vllm/logits_process.py +119 -0
  250. vllm/logprobs.py +28 -0
  251. vllm/lora/__init__.py +0 -0
  252. vllm/lora/layers/__init__.py +34 -0
  253. vllm/lora/layers/base.py +69 -0
  254. vllm/lora/layers/base_linear.py +185 -0
  255. vllm/lora/layers/column_parallel_linear.py +609 -0
  256. vllm/lora/layers/logits_processor.py +247 -0
  257. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  258. vllm/lora/layers/replicated_linear.py +60 -0
  259. vllm/lora/layers/row_parallel_linear.py +196 -0
  260. vllm/lora/layers/utils.py +65 -0
  261. vllm/lora/layers/vocal_parallel_embedding.py +174 -0
  262. vllm/lora/lora_weights.py +199 -0
  263. vllm/lora/models.py +816 -0
  264. vllm/lora/ops/__init__.py +0 -0
  265. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  266. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  267. vllm/lora/ops/torch_ops/__init__.py +16 -0
  268. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  269. vllm/lora/ops/triton_ops/__init__.py +12 -0
  270. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  271. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  272. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  273. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  274. vllm/lora/ops/triton_ops/utils.py +126 -0
  275. vllm/lora/ops/xla_ops/__init__.py +7 -0
  276. vllm/lora/ops/xla_ops/lora_ops.py +144 -0
  277. vllm/lora/peft_helper.py +127 -0
  278. vllm/lora/punica_wrapper/__init__.py +10 -0
  279. vllm/lora/punica_wrapper/punica_base.py +458 -0
  280. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  281. vllm/lora/punica_wrapper/punica_gpu.py +272 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  284. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  285. vllm/lora/punica_wrapper/utils.py +136 -0
  286. vllm/lora/request.py +97 -0
  287. vllm/lora/resolver.py +85 -0
  288. vllm/lora/utils.py +246 -0
  289. vllm/lora/worker_manager.py +267 -0
  290. vllm/model_executor/__init__.py +12 -0
  291. vllm/model_executor/custom_op.py +194 -0
  292. vllm/model_executor/layers/__init__.py +0 -0
  293. vllm/model_executor/layers/activation.py +575 -0
  294. vllm/model_executor/layers/attention_layer_base.py +23 -0
  295. vllm/model_executor/layers/fla/__init__.py +8 -0
  296. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  297. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  298. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  299. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  300. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  301. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  302. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  303. vllm/model_executor/layers/fla/ops/index.py +39 -0
  304. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  305. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  306. vllm/model_executor/layers/fla/ops/op.py +39 -0
  307. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  308. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  309. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  310. vllm/model_executor/layers/fused_moe/__init__.py +89 -0
  311. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +322 -0
  312. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +141 -0
  313. vllm/model_executor/layers/fused_moe/config.py +804 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  545. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +300 -0
  546. vllm/model_executor/layers/fused_moe/cutlass_moe.py +957 -0
  547. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +362 -0
  548. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  549. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +361 -0
  550. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +274 -0
  551. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +268 -0
  552. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +300 -0
  553. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +184 -0
  554. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +993 -0
  555. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +239 -0
  556. vllm/model_executor/layers/fused_moe/fused_moe.py +1890 -0
  557. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +307 -0
  558. vllm/model_executor/layers/fused_moe/layer.py +2195 -0
  559. vllm/model_executor/layers/fused_moe/modular_kernel.py +1038 -0
  560. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  561. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  562. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  563. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  564. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +341 -0
  565. vllm/model_executor/layers/fused_moe/prepare_finalize.py +70 -0
  566. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +424 -0
  567. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  568. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  569. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +143 -0
  570. vllm/model_executor/layers/fused_moe/trtllm_moe.py +191 -0
  571. vllm/model_executor/layers/fused_moe/utils.py +274 -0
  572. vllm/model_executor/layers/layernorm.py +395 -0
  573. vllm/model_executor/layers/lightning_attn.py +661 -0
  574. vllm/model_executor/layers/linear.py +1603 -0
  575. vllm/model_executor/layers/logits_processor.py +106 -0
  576. vllm/model_executor/layers/mamba/__init__.py +0 -0
  577. vllm/model_executor/layers/mamba/abstract.py +42 -0
  578. vllm/model_executor/layers/mamba/linear_attn.py +403 -0
  579. vllm/model_executor/layers/mamba/mamba_mixer.py +466 -0
  580. vllm/model_executor/layers/mamba/mamba_mixer2.py +764 -0
  581. vllm/model_executor/layers/mamba/mamba_utils.py +186 -0
  582. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  583. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1092 -0
  584. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  585. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  586. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +242 -0
  587. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +527 -0
  588. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +724 -0
  589. vllm/model_executor/layers/mamba/ops/ssd_combined.py +238 -0
  590. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +200 -0
  591. vllm/model_executor/layers/mamba/short_conv.py +253 -0
  592. vllm/model_executor/layers/mla.py +173 -0
  593. vllm/model_executor/layers/pooler.py +719 -0
  594. vllm/model_executor/layers/quantization/__init__.py +157 -0
  595. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  596. vllm/model_executor/layers/quantization/awq.py +228 -0
  597. vllm/model_executor/layers/quantization/awq_marlin.py +554 -0
  598. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  599. vllm/model_executor/layers/quantization/base_config.py +170 -0
  600. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  601. vllm/model_executor/layers/quantization/bitsandbytes.py +627 -0
  602. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  603. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +797 -0
  604. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2074 -0
  605. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  606. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  607. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  608. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  609. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  610. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +185 -0
  611. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  612. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  613. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  614. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +157 -0
  615. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  616. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +238 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +153 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +46 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  625. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  626. vllm/model_executor/layers/quantization/experts_int8.py +223 -0
  627. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  628. vllm/model_executor/layers/quantization/fp8.py +1098 -0
  629. vllm/model_executor/layers/quantization/gguf.py +599 -0
  630. vllm/model_executor/layers/quantization/gptq.py +340 -0
  631. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  632. vllm/model_executor/layers/quantization/gptq_marlin.py +751 -0
  633. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  634. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  635. vllm/model_executor/layers/quantization/inc.py +61 -0
  636. vllm/model_executor/layers/quantization/input_quant_fp8.py +156 -0
  637. vllm/model_executor/layers/quantization/ipex_quant.py +415 -0
  638. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  639. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  640. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  641. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  642. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  643. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  644. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  645. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  646. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  647. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  648. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  649. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  650. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  651. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +161 -0
  652. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  653. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  654. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  655. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  656. vllm/model_executor/layers/quantization/kv_cache.py +143 -0
  657. vllm/model_executor/layers/quantization/modelopt.py +1596 -0
  658. vllm/model_executor/layers/quantization/moe_wna16.py +484 -0
  659. vllm/model_executor/layers/quantization/mxfp4.py +988 -0
  660. vllm/model_executor/layers/quantization/petit.py +306 -0
  661. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  662. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  663. vllm/model_executor/layers/quantization/quark/quark.py +432 -0
  664. vllm/model_executor/layers/quantization/quark/quark_moe.py +561 -0
  665. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  666. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  667. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +239 -0
  668. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  669. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  670. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  671. vllm/model_executor/layers/quantization/rtn.py +466 -0
  672. vllm/model_executor/layers/quantization/schema.py +86 -0
  673. vllm/model_executor/layers/quantization/torchao.py +214 -0
  674. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  675. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  676. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  677. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  888. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  889. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +79 -0
  890. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +248 -0
  891. vllm/model_executor/layers/quantization/utils/fp8_utils.py +949 -0
  892. vllm/model_executor/layers/quantization/utils/gptq_utils.py +146 -0
  893. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  894. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  895. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  896. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  897. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  898. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  899. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  900. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  901. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +141 -0
  902. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  903. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  904. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  905. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  906. vllm/model_executor/layers/quantization/utils/quant_utils.py +641 -0
  907. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  908. vllm/model_executor/layers/resampler.py +270 -0
  909. vllm/model_executor/layers/rotary_embedding/__init__.py +204 -0
  910. vllm/model_executor/layers/rotary_embedding/base.py +177 -0
  911. vllm/model_executor/layers/rotary_embedding/common.py +150 -0
  912. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +138 -0
  913. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  914. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  915. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  916. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  917. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  918. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  919. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  920. vllm/model_executor/layers/rotary_embedding/mrope.py +1321 -0
  921. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  922. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  923. vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py +86 -0
  924. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  925. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  926. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  927. vllm/model_executor/layers/utils.py +195 -0
  928. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  929. vllm/model_executor/model_loader/__init__.py +138 -0
  930. vllm/model_executor/model_loader/base_loader.py +52 -0
  931. vllm/model_executor/model_loader/bitsandbytes_loader.py +788 -0
  932. vllm/model_executor/model_loader/default_loader.py +277 -0
  933. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  934. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  935. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  936. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  937. vllm/model_executor/model_loader/tensorizer.py +738 -0
  938. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  939. vllm/model_executor/model_loader/tpu.py +114 -0
  940. vllm/model_executor/model_loader/utils.py +292 -0
  941. vllm/model_executor/model_loader/weight_utils.py +990 -0
  942. vllm/model_executor/models/__init__.py +33 -0
  943. vllm/model_executor/models/adapters.py +542 -0
  944. vllm/model_executor/models/aimv2.py +246 -0
  945. vllm/model_executor/models/apertus.py +579 -0
  946. vllm/model_executor/models/arcee.py +422 -0
  947. vllm/model_executor/models/arctic.py +558 -0
  948. vllm/model_executor/models/aria.py +650 -0
  949. vllm/model_executor/models/aya_vision.py +468 -0
  950. vllm/model_executor/models/baichuan.py +474 -0
  951. vllm/model_executor/models/bailing_moe.py +642 -0
  952. vllm/model_executor/models/bamba.py +514 -0
  953. vllm/model_executor/models/bert.py +665 -0
  954. vllm/model_executor/models/bert_with_rope.py +687 -0
  955. vllm/model_executor/models/blip.py +339 -0
  956. vllm/model_executor/models/blip2.py +712 -0
  957. vllm/model_executor/models/bloom.py +374 -0
  958. vllm/model_executor/models/chameleon.py +1139 -0
  959. vllm/model_executor/models/chatglm.py +476 -0
  960. vllm/model_executor/models/clip.py +407 -0
  961. vllm/model_executor/models/cohere2_vision.py +481 -0
  962. vllm/model_executor/models/commandr.py +465 -0
  963. vllm/model_executor/models/config.py +445 -0
  964. vllm/model_executor/models/dbrx.py +471 -0
  965. vllm/model_executor/models/deepseek.py +497 -0
  966. vllm/model_executor/models/deepseek_eagle.py +240 -0
  967. vllm/model_executor/models/deepseek_mtp.py +289 -0
  968. vllm/model_executor/models/deepseek_v2.py +1444 -0
  969. vllm/model_executor/models/deepseek_vl2.py +658 -0
  970. vllm/model_executor/models/dots1.py +546 -0
  971. vllm/model_executor/models/dots_ocr.py +873 -0
  972. vllm/model_executor/models/ernie45.py +43 -0
  973. vllm/model_executor/models/ernie45_moe.py +607 -0
  974. vllm/model_executor/models/ernie45_vl.py +1527 -0
  975. vllm/model_executor/models/ernie45_vl_moe.py +727 -0
  976. vllm/model_executor/models/ernie_mtp.py +268 -0
  977. vllm/model_executor/models/exaone.py +550 -0
  978. vllm/model_executor/models/exaone4.py +533 -0
  979. vllm/model_executor/models/fairseq2_llama.py +154 -0
  980. vllm/model_executor/models/falcon.py +509 -0
  981. vllm/model_executor/models/falcon_h1.py +674 -0
  982. vllm/model_executor/models/fuyu.py +399 -0
  983. vllm/model_executor/models/gemma.py +425 -0
  984. vllm/model_executor/models/gemma2.py +422 -0
  985. vllm/model_executor/models/gemma3.py +555 -0
  986. vllm/model_executor/models/gemma3_mm.py +721 -0
  987. vllm/model_executor/models/gemma3n.py +1113 -0
  988. vllm/model_executor/models/gemma3n_mm.py +761 -0
  989. vllm/model_executor/models/glm.py +23 -0
  990. vllm/model_executor/models/glm4.py +304 -0
  991. vllm/model_executor/models/glm4_1v.py +1690 -0
  992. vllm/model_executor/models/glm4_moe.py +727 -0
  993. vllm/model_executor/models/glm4_moe_mtp.py +301 -0
  994. vllm/model_executor/models/glm4v.py +654 -0
  995. vllm/model_executor/models/gpt2.py +380 -0
  996. vllm/model_executor/models/gpt_bigcode.py +344 -0
  997. vllm/model_executor/models/gpt_j.py +339 -0
  998. vllm/model_executor/models/gpt_neox.py +330 -0
  999. vllm/model_executor/models/gpt_oss.py +712 -0
  1000. vllm/model_executor/models/granite.py +489 -0
  1001. vllm/model_executor/models/granite_speech.py +794 -0
  1002. vllm/model_executor/models/granitemoe.py +550 -0
  1003. vllm/model_executor/models/granitemoehybrid.py +614 -0
  1004. vllm/model_executor/models/granitemoeshared.py +332 -0
  1005. vllm/model_executor/models/gritlm.py +262 -0
  1006. vllm/model_executor/models/grok1.py +547 -0
  1007. vllm/model_executor/models/h2ovl.py +536 -0
  1008. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1009. vllm/model_executor/models/hyperclovax_vision.py +1192 -0
  1010. vllm/model_executor/models/idefics2_vision_model.py +417 -0
  1011. vllm/model_executor/models/idefics3.py +756 -0
  1012. vllm/model_executor/models/interfaces.py +959 -0
  1013. vllm/model_executor/models/interfaces_base.py +192 -0
  1014. vllm/model_executor/models/intern_vit.py +441 -0
  1015. vllm/model_executor/models/internlm2.py +450 -0
  1016. vllm/model_executor/models/internlm2_ve.py +148 -0
  1017. vllm/model_executor/models/interns1.py +838 -0
  1018. vllm/model_executor/models/interns1_vit.py +418 -0
  1019. vllm/model_executor/models/internvl.py +1423 -0
  1020. vllm/model_executor/models/jais.py +373 -0
  1021. vllm/model_executor/models/jamba.py +591 -0
  1022. vllm/model_executor/models/jina_vl.py +144 -0
  1023. vllm/model_executor/models/keye.py +1680 -0
  1024. vllm/model_executor/models/keye_vl1_5.py +602 -0
  1025. vllm/model_executor/models/kimi_vl.py +618 -0
  1026. vllm/model_executor/models/lfm2.py +548 -0
  1027. vllm/model_executor/models/llama.py +669 -0
  1028. vllm/model_executor/models/llama4.py +746 -0
  1029. vllm/model_executor/models/llama4_eagle.py +239 -0
  1030. vllm/model_executor/models/llama_eagle.py +179 -0
  1031. vllm/model_executor/models/llama_eagle3.py +296 -0
  1032. vllm/model_executor/models/llava.py +870 -0
  1033. vllm/model_executor/models/llava_next.py +571 -0
  1034. vllm/model_executor/models/llava_next_video.py +476 -0
  1035. vllm/model_executor/models/llava_onevision.py +942 -0
  1036. vllm/model_executor/models/longcat_flash.py +715 -0
  1037. vllm/model_executor/models/longcat_flash_mtp.py +352 -0
  1038. vllm/model_executor/models/mamba.py +275 -0
  1039. vllm/model_executor/models/mamba2.py +291 -0
  1040. vllm/model_executor/models/medusa.py +169 -0
  1041. vllm/model_executor/models/midashenglm.py +792 -0
  1042. vllm/model_executor/models/mimo.py +188 -0
  1043. vllm/model_executor/models/mimo_mtp.py +280 -0
  1044. vllm/model_executor/models/minicpm.py +631 -0
  1045. vllm/model_executor/models/minicpm3.py +230 -0
  1046. vllm/model_executor/models/minicpm_eagle.py +389 -0
  1047. vllm/model_executor/models/minicpmo.py +770 -0
  1048. vllm/model_executor/models/minicpmv.py +1784 -0
  1049. vllm/model_executor/models/minimax_text_01.py +986 -0
  1050. vllm/model_executor/models/minimax_vl_01.py +426 -0
  1051. vllm/model_executor/models/mistral3.py +628 -0
  1052. vllm/model_executor/models/mixtral.py +606 -0
  1053. vllm/model_executor/models/mllama4.py +1076 -0
  1054. vllm/model_executor/models/mlp_speculator.py +206 -0
  1055. vllm/model_executor/models/modernbert.py +374 -0
  1056. vllm/model_executor/models/module_mapping.py +72 -0
  1057. vllm/model_executor/models/molmo.py +1567 -0
  1058. vllm/model_executor/models/moonvit.py +673 -0
  1059. vllm/model_executor/models/motif.py +345 -0
  1060. vllm/model_executor/models/mpt.py +329 -0
  1061. vllm/model_executor/models/nano_nemotron_vl.py +1394 -0
  1062. vllm/model_executor/models/nemotron.py +507 -0
  1063. vllm/model_executor/models/nemotron_h.py +565 -0
  1064. vllm/model_executor/models/nemotron_nas.py +481 -0
  1065. vllm/model_executor/models/nemotron_vl.py +652 -0
  1066. vllm/model_executor/models/nvlm_d.py +203 -0
  1067. vllm/model_executor/models/olmo.py +404 -0
  1068. vllm/model_executor/models/olmo2.py +439 -0
  1069. vllm/model_executor/models/olmoe.py +483 -0
  1070. vllm/model_executor/models/opt.py +412 -0
  1071. vllm/model_executor/models/orion.py +348 -0
  1072. vllm/model_executor/models/ovis.py +559 -0
  1073. vllm/model_executor/models/ovis2_5.py +642 -0
  1074. vllm/model_executor/models/paligemma.py +411 -0
  1075. vllm/model_executor/models/persimmon.py +343 -0
  1076. vllm/model_executor/models/phi.py +356 -0
  1077. vllm/model_executor/models/phi3.py +19 -0
  1078. vllm/model_executor/models/phi3v.py +698 -0
  1079. vllm/model_executor/models/phi4_multimodal.py +1475 -0
  1080. vllm/model_executor/models/phi4mm.py +1279 -0
  1081. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1082. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1083. vllm/model_executor/models/phimoe.py +679 -0
  1084. vllm/model_executor/models/pixtral.py +1345 -0
  1085. vllm/model_executor/models/plamo2.py +978 -0
  1086. vllm/model_executor/models/qwen.py +361 -0
  1087. vllm/model_executor/models/qwen2.py +523 -0
  1088. vllm/model_executor/models/qwen2_5_omni_thinker.py +984 -0
  1089. vllm/model_executor/models/qwen2_5_vl.py +1481 -0
  1090. vllm/model_executor/models/qwen2_audio.py +489 -0
  1091. vllm/model_executor/models/qwen2_moe.py +558 -0
  1092. vllm/model_executor/models/qwen2_rm.py +122 -0
  1093. vllm/model_executor/models/qwen2_vl.py +1670 -0
  1094. vllm/model_executor/models/qwen3.py +341 -0
  1095. vllm/model_executor/models/qwen3_moe.py +692 -0
  1096. vllm/model_executor/models/qwen3_next.py +1266 -0
  1097. vllm/model_executor/models/qwen3_next_mtp.py +281 -0
  1098. vllm/model_executor/models/qwen3_vl.py +1613 -0
  1099. vllm/model_executor/models/qwen3_vl_moe.py +358 -0
  1100. vllm/model_executor/models/qwen_vl.py +795 -0
  1101. vllm/model_executor/models/radio.py +576 -0
  1102. vllm/model_executor/models/registry.py +990 -0
  1103. vllm/model_executor/models/roberta.py +252 -0
  1104. vllm/model_executor/models/rvl.py +103 -0
  1105. vllm/model_executor/models/seed_oss.py +485 -0
  1106. vllm/model_executor/models/siglip.py +540 -0
  1107. vllm/model_executor/models/siglip2navit.py +689 -0
  1108. vllm/model_executor/models/skyworkr1v.py +911 -0
  1109. vllm/model_executor/models/smolvlm.py +44 -0
  1110. vllm/model_executor/models/solar.py +504 -0
  1111. vllm/model_executor/models/stablelm.py +341 -0
  1112. vllm/model_executor/models/starcoder2.py +354 -0
  1113. vllm/model_executor/models/step3_text.py +510 -0
  1114. vllm/model_executor/models/step3_vl.py +1072 -0
  1115. vllm/model_executor/models/swin.py +475 -0
  1116. vllm/model_executor/models/tarsier.py +639 -0
  1117. vllm/model_executor/models/telechat2.py +151 -0
  1118. vllm/model_executor/models/teleflm.py +79 -0
  1119. vllm/model_executor/models/terratorch.py +294 -0
  1120. vllm/model_executor/models/transformers.py +948 -0
  1121. vllm/model_executor/models/ultravox.py +654 -0
  1122. vllm/model_executor/models/utils.py +808 -0
  1123. vllm/model_executor/models/vision.py +404 -0
  1124. vllm/model_executor/models/voxtral.py +786 -0
  1125. vllm/model_executor/models/whisper.py +963 -0
  1126. vllm/model_executor/models/zamba2.py +960 -0
  1127. vllm/model_executor/parameter.py +620 -0
  1128. vllm/model_executor/utils.py +86 -0
  1129. vllm/model_executor/warmup/__init__.py +0 -0
  1130. vllm/model_executor/warmup/deep_gemm_warmup.py +230 -0
  1131. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1132. vllm/multimodal/__init__.py +33 -0
  1133. vllm/multimodal/audio.py +116 -0
  1134. vllm/multimodal/base.py +27 -0
  1135. vllm/multimodal/cache.py +697 -0
  1136. vllm/multimodal/evs.py +273 -0
  1137. vllm/multimodal/hasher.py +102 -0
  1138. vllm/multimodal/image.py +130 -0
  1139. vllm/multimodal/inputs.py +987 -0
  1140. vllm/multimodal/parse.py +511 -0
  1141. vllm/multimodal/processing.py +2148 -0
  1142. vllm/multimodal/profiling.py +284 -0
  1143. vllm/multimodal/registry.py +345 -0
  1144. vllm/multimodal/utils.py +503 -0
  1145. vllm/multimodal/video.py +319 -0
  1146. vllm/outputs.py +324 -0
  1147. vllm/platforms/__init__.py +263 -0
  1148. vllm/platforms/cpu.py +340 -0
  1149. vllm/platforms/cuda.py +668 -0
  1150. vllm/platforms/interface.py +620 -0
  1151. vllm/platforms/rocm.py +497 -0
  1152. vllm/platforms/tpu.py +233 -0
  1153. vllm/platforms/xpu.py +243 -0
  1154. vllm/plugins/__init__.py +72 -0
  1155. vllm/plugins/io_processors/__init__.py +68 -0
  1156. vllm/plugins/io_processors/interface.py +67 -0
  1157. vllm/plugins/lora_resolvers/README.md +16 -0
  1158. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1159. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1160. vllm/pooling_params.py +191 -0
  1161. vllm/profiler/__init__.py +0 -0
  1162. vllm/profiler/layerwise_profile.py +375 -0
  1163. vllm/profiler/utils.py +148 -0
  1164. vllm/py.typed +2 -0
  1165. vllm/ray/__init__.py +0 -0
  1166. vllm/ray/lazy_utils.py +22 -0
  1167. vllm/ray/ray_env.py +72 -0
  1168. vllm/reasoning/__init__.py +29 -0
  1169. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1170. vllm/reasoning/basic_parsers.py +156 -0
  1171. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1172. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1173. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1174. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1175. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1176. vllm/reasoning/mistral_reasoning_parser.py +56 -0
  1177. vllm/reasoning/qwen3_reasoning_parser.py +72 -0
  1178. vllm/reasoning/seedoss_reasoning_parser.py +28 -0
  1179. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1180. vllm/sampling_params.py +593 -0
  1181. vllm/scalar_type.py +349 -0
  1182. vllm/scripts.py +15 -0
  1183. vllm/sequence.py +103 -0
  1184. vllm/tasks.py +11 -0
  1185. vllm/test_utils.py +129 -0
  1186. vllm/third_party/__init__.py +0 -0
  1187. vllm/third_party/pynvml.py +6140 -0
  1188. vllm/tracing.py +136 -0
  1189. vllm/transformers_utils/__init__.py +24 -0
  1190. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1191. vllm/transformers_utils/chat_templates/registry.py +70 -0
  1192. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1193. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1194. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1195. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1196. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1197. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1198. vllm/transformers_utils/config.py +1102 -0
  1199. vllm/transformers_utils/config_parser_base.py +20 -0
  1200. vllm/transformers_utils/configs/__init__.py +63 -0
  1201. vllm/transformers_utils/configs/arctic.py +207 -0
  1202. vllm/transformers_utils/configs/chatglm.py +72 -0
  1203. vllm/transformers_utils/configs/deepseek_v3.py +101 -0
  1204. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1205. vllm/transformers_utils/configs/dotsocr.py +69 -0
  1206. vllm/transformers_utils/configs/eagle.py +84 -0
  1207. vllm/transformers_utils/configs/falcon.py +90 -0
  1208. vllm/transformers_utils/configs/jais.py +237 -0
  1209. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1210. vllm/transformers_utils/configs/medusa.py +63 -0
  1211. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1212. vllm/transformers_utils/configs/mistral.py +165 -0
  1213. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1214. vllm/transformers_utils/configs/moonvit.py +33 -0
  1215. vllm/transformers_utils/configs/nemotron.py +205 -0
  1216. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1217. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1218. vllm/transformers_utils/configs/olmo3.py +80 -0
  1219. vllm/transformers_utils/configs/ovis.py +176 -0
  1220. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1221. vllm/transformers_utils/configs/radio.py +91 -0
  1222. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1223. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1224. vllm/transformers_utils/configs/speculators/base.py +111 -0
  1225. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1226. vllm/transformers_utils/configs/ultravox.py +116 -0
  1227. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1228. vllm/transformers_utils/dynamic_module.py +60 -0
  1229. vllm/transformers_utils/processor.py +299 -0
  1230. vllm/transformers_utils/processors/__init__.py +16 -0
  1231. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1232. vllm/transformers_utils/processors/ovis.py +420 -0
  1233. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1234. vllm/transformers_utils/runai_utils.py +104 -0
  1235. vllm/transformers_utils/s3_utils.py +93 -0
  1236. vllm/transformers_utils/tokenizer.py +292 -0
  1237. vllm/transformers_utils/tokenizer_base.py +154 -0
  1238. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1239. vllm/transformers_utils/tokenizers/mistral.py +521 -0
  1240. vllm/transformers_utils/utils.py +108 -0
  1241. vllm/triton_utils/__init__.py +16 -0
  1242. vllm/triton_utils/importing.py +96 -0
  1243. vllm/usage/__init__.py +0 -0
  1244. vllm/usage/usage_lib.py +259 -0
  1245. vllm/utils/__init__.py +3566 -0
  1246. vllm/utils/deep_gemm.py +319 -0
  1247. vllm/utils/flashinfer.py +443 -0
  1248. vllm/utils/jsontree.py +178 -0
  1249. vllm/utils/tensor_schema.py +235 -0
  1250. vllm/v1/__init__.py +0 -0
  1251. vllm/v1/attention/__init__.py +0 -0
  1252. vllm/v1/attention/backends/__init__.py +0 -0
  1253. vllm/v1/attention/backends/cpu_attn.py +919 -0
  1254. vllm/v1/attention/backends/flash_attn.py +795 -0
  1255. vllm/v1/attention/backends/flashinfer.py +1181 -0
  1256. vllm/v1/attention/backends/flex_attention.py +861 -0
  1257. vllm/v1/attention/backends/gdn_attn.py +332 -0
  1258. vllm/v1/attention/backends/linear_attn.py +67 -0
  1259. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1260. vllm/v1/attention/backends/mamba2_attn.py +232 -0
  1261. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1262. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1263. vllm/v1/attention/backends/mla/common.py +1783 -0
  1264. vllm/v1/attention/backends/mla/cutlass_mla.py +248 -0
  1265. vllm/v1/attention/backends/mla/flashattn_mla.py +271 -0
  1266. vllm/v1/attention/backends/mla/flashinfer_mla.py +114 -0
  1267. vllm/v1/attention/backends/mla/flashmla.py +203 -0
  1268. vllm/v1/attention/backends/mla/flashmla_sparse.py +544 -0
  1269. vllm/v1/attention/backends/mla/indexer.py +342 -0
  1270. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1271. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1272. vllm/v1/attention/backends/pallas.py +409 -0
  1273. vllm/v1/attention/backends/rocm_aiter_fa.py +549 -0
  1274. vllm/v1/attention/backends/rocm_attn.py +426 -0
  1275. vllm/v1/attention/backends/short_conv_attn.py +94 -0
  1276. vllm/v1/attention/backends/tree_attn.py +451 -0
  1277. vllm/v1/attention/backends/triton_attn.py +361 -0
  1278. vllm/v1/attention/backends/utils.py +990 -0
  1279. vllm/v1/attention/backends/xformers.py +438 -0
  1280. vllm/v1/core/__init__.py +0 -0
  1281. vllm/v1/core/block_pool.py +416 -0
  1282. vllm/v1/core/encoder_cache_manager.py +333 -0
  1283. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1284. vllm/v1/core/kv_cache_manager.py +399 -0
  1285. vllm/v1/core/kv_cache_utils.py +1291 -0
  1286. vllm/v1/core/sched/__init__.py +0 -0
  1287. vllm/v1/core/sched/async_scheduler.py +47 -0
  1288. vllm/v1/core/sched/interface.py +158 -0
  1289. vllm/v1/core/sched/output.py +166 -0
  1290. vllm/v1/core/sched/request_queue.py +224 -0
  1291. vllm/v1/core/sched/scheduler.py +1296 -0
  1292. vllm/v1/core/sched/utils.py +69 -0
  1293. vllm/v1/core/single_type_kv_cache_manager.py +671 -0
  1294. vllm/v1/cudagraph_dispatcher.py +125 -0
  1295. vllm/v1/engine/__init__.py +203 -0
  1296. vllm/v1/engine/async_llm.py +742 -0
  1297. vllm/v1/engine/coordinator.py +357 -0
  1298. vllm/v1/engine/core.py +1235 -0
  1299. vllm/v1/engine/core_client.py +1334 -0
  1300. vllm/v1/engine/detokenizer.py +349 -0
  1301. vllm/v1/engine/exceptions.py +17 -0
  1302. vllm/v1/engine/llm_engine.py +370 -0
  1303. vllm/v1/engine/logprobs.py +201 -0
  1304. vllm/v1/engine/output_processor.py +576 -0
  1305. vllm/v1/engine/parallel_sampling.py +133 -0
  1306. vllm/v1/engine/processor.py +545 -0
  1307. vllm/v1/engine/utils.py +860 -0
  1308. vllm/v1/executor/__init__.py +0 -0
  1309. vllm/v1/executor/abstract.py +137 -0
  1310. vllm/v1/executor/multiproc_executor.py +726 -0
  1311. vllm/v1/executor/ray_distributed_executor.py +108 -0
  1312. vllm/v1/executor/utils.py +23 -0
  1313. vllm/v1/kv_cache_interface.py +375 -0
  1314. vllm/v1/kv_offload/__init__.py +0 -0
  1315. vllm/v1/kv_offload/abstract.py +165 -0
  1316. vllm/v1/kv_offload/backend.py +96 -0
  1317. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1318. vllm/v1/kv_offload/backends/cpu.py +61 -0
  1319. vllm/v1/kv_offload/cpu.py +75 -0
  1320. vllm/v1/kv_offload/factory.py +56 -0
  1321. vllm/v1/kv_offload/lru_manager.py +132 -0
  1322. vllm/v1/kv_offload/mediums.py +39 -0
  1323. vllm/v1/kv_offload/spec.py +61 -0
  1324. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1325. vllm/v1/kv_offload/worker/cpu_gpu.py +171 -0
  1326. vllm/v1/kv_offload/worker/worker.py +142 -0
  1327. vllm/v1/metrics/__init__.py +0 -0
  1328. vllm/v1/metrics/loggers.py +741 -0
  1329. vllm/v1/metrics/prometheus.py +82 -0
  1330. vllm/v1/metrics/ray_wrappers.py +152 -0
  1331. vllm/v1/metrics/reader.py +246 -0
  1332. vllm/v1/metrics/stats.py +257 -0
  1333. vllm/v1/outputs.py +161 -0
  1334. vllm/v1/pool/__init__.py +0 -0
  1335. vllm/v1/pool/metadata.py +77 -0
  1336. vllm/v1/request.py +241 -0
  1337. vllm/v1/sample/__init__.py +0 -0
  1338. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1339. vllm/v1/sample/logits_processor/builtin.py +275 -0
  1340. vllm/v1/sample/logits_processor/interface.py +97 -0
  1341. vllm/v1/sample/logits_processor/state.py +161 -0
  1342. vllm/v1/sample/metadata.py +43 -0
  1343. vllm/v1/sample/ops/__init__.py +0 -0
  1344. vllm/v1/sample/ops/bad_words.py +39 -0
  1345. vllm/v1/sample/ops/logprobs.py +26 -0
  1346. vllm/v1/sample/ops/penalties.py +43 -0
  1347. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1348. vllm/v1/sample/rejection_sampler.py +623 -0
  1349. vllm/v1/sample/sampler.py +285 -0
  1350. vllm/v1/sample/tpu/__init__.py +0 -0
  1351. vllm/v1/sample/tpu/metadata.py +124 -0
  1352. vllm/v1/sample/tpu/sampler.py +213 -0
  1353. vllm/v1/serial_utils.py +423 -0
  1354. vllm/v1/spec_decode/__init__.py +0 -0
  1355. vllm/v1/spec_decode/eagle.py +1011 -0
  1356. vllm/v1/spec_decode/medusa.py +66 -0
  1357. vllm/v1/spec_decode/metadata.py +62 -0
  1358. vllm/v1/spec_decode/metrics.py +211 -0
  1359. vllm/v1/spec_decode/ngram_proposer.py +276 -0
  1360. vllm/v1/spec_decode/utils.py +14 -0
  1361. vllm/v1/structured_output/__init__.py +295 -0
  1362. vllm/v1/structured_output/backend_guidance.py +245 -0
  1363. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1364. vllm/v1/structured_output/backend_outlines.py +320 -0
  1365. vllm/v1/structured_output/backend_types.py +134 -0
  1366. vllm/v1/structured_output/backend_xgrammar.py +327 -0
  1367. vllm/v1/structured_output/request.py +86 -0
  1368. vllm/v1/structured_output/utils.py +454 -0
  1369. vllm/v1/utils.py +396 -0
  1370. vllm/v1/worker/__init__.py +0 -0
  1371. vllm/v1/worker/block_table.py +210 -0
  1372. vllm/v1/worker/cpu_model_runner.py +175 -0
  1373. vllm/v1/worker/cpu_worker.py +156 -0
  1374. vllm/v1/worker/gpu_input_batch.py +863 -0
  1375. vllm/v1/worker/gpu_model_runner.py +4160 -0
  1376. vllm/v1/worker/gpu_ubatch_wrapper.py +399 -0
  1377. vllm/v1/worker/gpu_worker.py +710 -0
  1378. vllm/v1/worker/kv_connector_model_runner_mixin.py +132 -0
  1379. vllm/v1/worker/lora_model_runner_mixin.py +183 -0
  1380. vllm/v1/worker/tpu_input_batch.py +587 -0
  1381. vllm/v1/worker/tpu_model_runner.py +1946 -0
  1382. vllm/v1/worker/tpu_worker.py +346 -0
  1383. vllm/v1/worker/ubatch_splitting.py +192 -0
  1384. vllm/v1/worker/ubatch_utils.py +27 -0
  1385. vllm/v1/worker/ubatching.py +224 -0
  1386. vllm/v1/worker/utils.py +344 -0
  1387. vllm/v1/worker/worker_base.py +65 -0
  1388. vllm/v1/worker/xpu_model_runner.py +57 -0
  1389. vllm/v1/worker/xpu_worker.py +179 -0
  1390. vllm/version.py +41 -0
  1391. vllm/vllm_flash_attn/.gitkeep +0 -0
  1392. vllm/worker/__init__.py +0 -0
  1393. vllm/worker/worker_base.py +279 -0
  1394. vllm_cpu-0.11.0.post2.dist-info/METADATA +348 -0
  1395. vllm_cpu-0.11.0.post2.dist-info/RECORD +1398 -0
  1396. vllm_cpu-0.11.0.post2.dist-info/WHEEL +5 -0
  1397. vllm_cpu-0.11.0.post2.dist-info/entry_points.txt +5 -0
  1398. vllm_cpu-0.11.0.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2195 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ from abc import abstractmethod
5
+ from collections.abc import Iterable
6
+ from contextlib import nullcontext
7
+ from enum import Enum
8
+ from typing import Callable, Literal, Optional, Union, get_args, overload
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch.nn.parameter import UninitializedParameter
13
+
14
+ import vllm.envs as envs
15
+ from vllm.config import get_current_vllm_config
16
+ from vllm.config.parallel import ExpertPlacementStrategy
17
+ from vllm.distributed import (get_dp_group, get_ep_group,
18
+ get_tensor_model_parallel_world_size,
19
+ tensor_model_parallel_all_reduce)
20
+ from vllm.distributed.eplb.eplb_state import EplbState
21
+ from vllm.forward_context import ForwardContext, get_forward_context
22
+ from vllm.logger import init_logger
23
+ from vllm.model_executor.custom_op import CustomOp
24
+ # yapf: disable
25
+ from vllm.model_executor.layers.fused_moe.config import (
26
+ FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEConfig, FusedMoEParallelConfig,
27
+ FusedMoEQuantConfig, biased_moe_quant_config)
28
+ from vllm.model_executor.layers.fused_moe.fused_moe import (
29
+ zero_experts_compute_triton)
30
+ # yapf: enable
31
+ from vllm.model_executor.layers.fused_moe.modular_kernel import (
32
+ FusedMoEActivationFormat, FusedMoEModularKernel,
33
+ FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize)
34
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
35
+ is_rocm_aiter_moe_enabled)
36
+ from vllm.model_executor.layers.fused_moe.routing_simulator import (
37
+ RoutingSimulator)
38
+ from vllm.model_executor.layers.quantization.base_config import (
39
+ QuantizationConfig, QuantizeMethodBase)
40
+ from vllm.model_executor.utils import set_weight_attrs
41
+ from vllm.platforms import current_platform
42
+ from vllm.platforms.interface import CpuArchEnum
43
+ from vllm.utils import (cdiv, direct_register_custom_op, has_deep_ep, has_pplx,
44
+ round_up)
45
+ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
46
+ from vllm.v1.worker.ubatching import dbo_current_ubatch_id
47
+
48
+ if current_platform.is_cuda_alike():
49
+ from .fused_batched_moe import BatchedTritonExperts
50
+ from .fused_moe import (TritonExperts, eplb_map_to_physical_and_record,
51
+ fused_experts)
52
+ if has_pplx():
53
+ from .pplx_prepare_finalize import (PplxPrepareAndFinalize,
54
+ pplx_hidden_dim_scale_bytes)
55
+ if has_deep_ep():
56
+ from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
57
+ from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SHAPE,
58
+ DeepEPLLPrepareAndFinalize)
59
+ else:
60
+ fused_experts = None # type: ignore
61
+ FusedMoEPermuteExpertsUnpermute = None # type: ignore
62
+ FusedMoEPrepareAndFinalize = None # type: ignore
63
+
64
+ def _eplb_map_to_physical_and_record(
65
+ topk_ids: torch.Tensor, expert_load_view: torch.Tensor,
66
+ logical_to_physical_map: torch.Tensor,
67
+ logical_replica_count: torch.Tensor,
68
+ indices_type: Optional[torch.dtype]) -> torch.Tensor:
69
+ # CPU fallback: EPLB is not supported, so return the ids as-is.
70
+ return topk_ids
71
+
72
+ eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record
73
+
74
+ if is_rocm_aiter_moe_enabled():
75
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501
76
+ rocm_aiter_grouped_topk as grouped_topk)
77
+ else:
78
+ from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
79
+ if current_platform.is_tpu():
80
+ from .moe_pallas import fused_moe as fused_moe_pallas
81
+ else:
82
+ fused_moe_pallas = None # type: ignore
83
+
84
+ logger = init_logger(__name__)
85
+
86
+
87
+ class FusedMoeWeightScaleSupported(Enum):
88
+ TENSOR = "tensor"
89
+ CHANNEL = "channel"
90
+ GROUP = "group"
91
+ BLOCK = "block"
92
+
93
+
94
+ class FusedMoEMethodBase(QuantizeMethodBase):
95
+
96
+ def __init__(self, moe: FusedMoEConfig):
97
+ super().__init__()
98
+ self.moe = moe
99
+ self.moe_quant_config: Optional[FusedMoEQuantConfig] = None
100
+ self.fused_experts: Optional[FusedMoEModularKernel] = None
101
+ self.topk_indices_dtype = None
102
+
103
+ @abstractmethod
104
+ def create_weights(self, layer: torch.nn.Module, num_experts: int,
105
+ hidden_size: int, intermediate_size_per_partition: int,
106
+ params_dtype: torch.dtype, **extra_weight_attrs):
107
+ raise NotImplementedError
108
+
109
+ def uses_weight_scale_2_pattern(self) -> bool:
110
+ """
111
+ Returns True if this quantization method uses 'weight_scale_2' pattern
112
+ for per-tensor weight scales (e.g., FP4 variants), False otherwise.
113
+
114
+ This method should be overridden by subclasses that use the
115
+ 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
116
+ """
117
+ return False
118
+
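A hypothetical override sketch (the subclass name is made up; only the hook shown is from the code above) of how an FP4-style method would opt into the 'weight_scale_2' naming:

class HypotheticalFp4MoEMethod(FusedMoEMethodBase):

    def uses_weight_scale_2_pattern(self) -> bool:
        # Assumed behavior: FP4 checkpoints store per-tensor scales
        # under "weight_scale_2" rather than "weight_scale".
        return True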
119
+ @staticmethod
120
+ def _maybe_make_prepare_finalize(
121
+ moe: FusedMoEConfig,
122
+ quant_config: Optional[FusedMoEQuantConfig],
123
+ ) -> Optional[FusedMoEPrepareAndFinalize]:
124
+ all2all_manager = get_ep_group().device_communicator.all2all_manager
125
+ assert all2all_manager is not None
126
+
127
+ prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None
128
+
129
+ # TODO: could allow this now
130
+ assert not moe.use_flashinfer_cutlass_kernels, \
131
+ "Must be created in modelopt.py"
132
+
133
+ if moe.use_pplx_kernels:
134
+ assert quant_config is not None
135
+
136
+ hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes(
137
+ moe.max_num_tokens,
138
+ moe.hidden_dim,
139
+ moe.in_dtype,
140
+ quant_config.quant_dtype,
141
+ per_act_token_quant=quant_config.per_act_token_quant,
142
+ block_shape=quant_config.block_shape,
143
+ )
144
+
145
+ all_to_all_args = dict(
146
+ max_num_tokens=moe.max_num_tokens,
147
+ num_experts=moe.num_experts,
148
+ experts_per_token=moe.experts_per_token, # topk
149
+ rank=all2all_manager.rank,
150
+ world_size=all2all_manager.world_size,
151
+ # dp_size actually means tp_size here; this is a bug in the pplx kernels
152
+ dp_size=all2all_manager.tp_group.world_size,
153
+ hidden_dim=moe.hidden_dim,
154
+ hidden_dim_bytes=hidden_dim_bytes,
155
+ hidden_dim_scale_bytes=hidden_scale_bytes,
156
+ )
157
+
158
+ num_dispatchers = (all2all_manager.world_size //
159
+ all2all_manager.tp_group.world_size)
160
+
161
+ # Intranode pplx a2a takes a group name while internode does not.
162
+ if not all2all_manager.internode:
163
+ all_to_all_args[
164
+ "group_name"] = all2all_manager.cpu_group.group_name
165
+
166
+ handle = all2all_manager.get_handle(all_to_all_args)
167
+
168
+ prepare_finalize = PplxPrepareAndFinalize(
169
+ handle,
170
+ max_num_tokens=moe.max_num_tokens,
171
+ num_local_experts=moe.num_local_experts,
172
+ num_dispatchers=num_dispatchers,
173
+ )
174
+ elif moe.use_deepep_ht_kernels:
175
+ assert moe.dp_size == all2all_manager.dp_world_size
176
+
177
+ all_to_all_args = dict()
178
+ handle = all2all_manager.get_handle(all_to_all_args)
179
+ prepare_finalize = DeepEPHTPrepareAndFinalize(
180
+ handle,
181
+ num_dispatchers=all2all_manager.world_size,
182
+ dp_size=all2all_manager.dp_world_size,
183
+ rank_expert_offset=all2all_manager.rank *
184
+ moe.num_local_experts,
185
+ )
186
+
187
+ elif moe.use_deepep_ll_kernels:
188
+ assert quant_config is not None
189
+ all_to_all_args = dict(
190
+ max_num_tokens_per_dp_rank=moe.max_num_tokens,
191
+ token_hidden_size=moe.hidden_dim,
192
+ num_ep_ranks=all2all_manager.world_size,
193
+ num_global_experts=moe.num_experts,
194
+ num_local_experts=moe.num_experts //
195
+ all2all_manager.world_size)
196
+ handle = all2all_manager.get_handle(all_to_all_args)
197
+
198
+ # Note: We may want to use FP8 dispatch just to reduce
199
+ # data movement.
200
+ use_fp8_dispatch = (
201
+ quant_config.quant_dtype == current_platform.fp8_dtype()
202
+ and quant_config.block_shape == DEEPEP_QUANT_BLOCK_SHAPE)
203
+
204
+ prepare_finalize = DeepEPLLPrepareAndFinalize(
205
+ handle,
206
+ max_tokens_per_rank=moe.max_num_tokens,
207
+ num_dispatchers=all2all_manager.world_size,
208
+ use_fp8_dispatch=use_fp8_dispatch,
209
+ )
210
+
211
+ return prepare_finalize
212
+
213
+ def maybe_make_prepare_finalize(
214
+ self) -> Optional[FusedMoEPrepareAndFinalize]:
215
+ if self.moe.moe_parallel_config.use_all2all_kernels:
216
+ return FusedMoEMethodBase._maybe_make_prepare_finalize(
217
+ self.moe, self.moe_quant_config)
218
+ else:
219
+ return None
220
+
221
+ # Note: init_prepare_finalize should only be called by
222
+ # prepare_communication_buffer_for_model.
223
+ def init_prepare_finalize(self, layer: torch.nn.Module):
224
+ assert self.moe is not None
225
+
226
+ # We must get the quant config here so that the layer is
227
+ # completely initialized, i.e. all weights loaded and post
228
+ # processed.
229
+ self.moe_quant_config = self.get_fused_moe_quant_config(layer)
230
+
231
+ prepare_finalize = self.maybe_make_prepare_finalize()
232
+
233
+ if prepare_finalize is not None:
234
+ logger.debug("%s for %s(%s)", prepare_finalize.__class__.__name__,
235
+ self, id(self))
236
+ assert self.topk_indices_dtype is None
237
+ assert self.fused_experts is None, \
238
+ f"Attempt to override experts for {id(self)}!"
239
+ self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
240
+ experts = self.select_gemm_impl(prepare_finalize, layer)
241
+ self.fused_experts = FusedMoEModularKernel(
242
+ prepare_finalize,
243
+ experts,
244
+ layer.shared_experts,
245
+ )
246
+
247
+ def select_gemm_impl(
248
+ self,
249
+ prepare_finalize: FusedMoEPrepareAndFinalize,
250
+ layer: torch.nn.Module,
251
+ ) -> FusedMoEPermuteExpertsUnpermute:
252
+ # based on the all2all implementation, select the appropriate
253
+ # gemm implementation
254
+ raise NotImplementedError(
255
+ f"{self.__class__.__name__} must select appropriate gemm "
256
+ "implementation based on the prepare_finalize")
257
+
258
+ @abstractmethod
259
+ def get_fused_moe_quant_config(
260
+ self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
261
+ raise NotImplementedError
262
+
263
+ @abstractmethod
264
+ def apply(
265
+ self,
266
+ layer: torch.nn.Module,
267
+ x: torch.Tensor,
268
+ router_logits: torch.Tensor,
269
+ top_k: int,
270
+ renormalize: bool,
271
+ use_grouped_topk: bool = False,
272
+ topk_group: Optional[int] = None,
273
+ num_expert_group: Optional[int] = None,
274
+ global_num_experts: int = -1,
275
+ expert_map: Optional[torch.Tensor] = None,
276
+ custom_routing_function: Optional[Callable] = None,
277
+ scoring_func: str = "softmax",
278
+ routed_scaling_factor: float = 1.0,
279
+ e_score_correction_bias: Optional[torch.Tensor] = None,
280
+ apply_router_weight_on_input: bool = False,
281
+ activation: str = "silu",
282
+ enable_eplb: bool = False,
283
+ expert_load_view: Optional[torch.Tensor] = None,
284
+ logical_to_physical_map: Optional[torch.Tensor] = None,
285
+ logical_replica_count: Optional[torch.Tensor] = None,
286
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
287
+ raise NotImplementedError
288
+
289
+
290
+ @CustomOp.register("unquantized_fused_moe")
291
+ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
292
+ """MoE method without quantization."""
293
+
294
+ def __init__(self, moe: FusedMoEConfig):
295
+ super().__init__(moe)
296
+ self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
297
+ if self.rocm_aiter_moe_enabled:
298
+ from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
299
+ self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
300
+ else:
301
+ self.rocm_aiter_fused_experts = None # type: ignore
302
+
303
+ # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUs
304
+ self.flashinfer_cutlass_moe_enabled = (
305
+ has_flashinfer_cutlass_fused_moe()
306
+ and envs.VLLM_USE_FLASHINFER_MOE_FP16
307
+ and self.moe.moe_parallel_config.use_ep
308
+ and self.moe.moe_parallel_config.dp_size == 1
309
+ and current_platform.get_device_capability()[0] >= 9)
310
+ if self.flashinfer_cutlass_moe_enabled:
311
+ logger.info_once(
312
+ "Enabling FlashInfer CUTLASS MoE for UnquantizedFusedMoEMethod"
313
+ )
314
+ from functools import partial
315
+
316
+ from .flashinfer_cutlass_moe import flashinfer_cutlass_moe
317
+ self.flashinfer_cutlass_moe = partial(
318
+ flashinfer_cutlass_moe,
319
+ quant_config=FUSED_MOE_UNQUANTIZED_CONFIG,
320
+ tp_rank=self.moe.moe_parallel_config.tp_rank,
321
+ tp_size=self.moe.moe_parallel_config.tp_size,
322
+ ep_rank=self.moe.moe_parallel_config.ep_rank,
323
+ ep_size=self.moe.moe_parallel_config.ep_size)
324
+ else:
325
+ if (self.moe.moe_parallel_config.use_ep
326
+ and self.moe.moe_parallel_config.dp_size == 1):
327
+ logger.info_once(
328
+ "FlashInfer CUTLASS MoE is available for EP"
329
+ " but not enabled, consider setting"
330
+ " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.")
331
+ elif self.moe.moe_parallel_config.dp_size > 1:
332
+ logger.info_once(
333
+ "FlashInfer CUTLASS MoE is currently not available for DP."
334
+ )
335
+ self.flashinfer_cutlass_moe = None # type: ignore
336
+
337
+ def maybe_make_prepare_finalize(
338
+ self) -> Optional[FusedMoEPrepareAndFinalize]:
339
+ if self.rocm_aiter_moe_enabled:
340
+ return None
341
+ else:
342
+ return super().maybe_make_prepare_finalize()
343
+
344
+ def select_gemm_impl(
345
+ self,
346
+ prepare_finalize: FusedMoEPrepareAndFinalize,
347
+ layer: torch.nn.Module,
348
+ ) -> FusedMoEPermuteExpertsUnpermute:
349
+ assert self.moe_quant_config is not None
350
+ if (prepare_finalize.activation_format ==
351
+ FusedMoEActivationFormat.BatchedExperts):
352
+ logger.debug("BatchedTritonExperts %s", self.moe)
353
+ return BatchedTritonExperts(
354
+ max_num_tokens=self.moe.max_num_tokens,
355
+ num_dispatchers=prepare_finalize.num_dispatchers(),
356
+ quant_config=self.moe_quant_config,
357
+ )
358
+ else:
359
+ logger.debug("TritonExperts %s", self.moe)
360
+ return TritonExperts(self.moe_quant_config)
361
+
362
+ def create_weights(self, layer: torch.nn.Module, num_experts: int,
363
+ hidden_size: int, intermediate_size_per_partition: int,
364
+ params_dtype: torch.dtype, **extra_weight_attrs):
365
+ # Fused gate_up_proj (column parallel)
366
+ w13_weight = torch.nn.Parameter(torch.empty(
367
+ num_experts,
368
+ 2 * intermediate_size_per_partition,
369
+ hidden_size,
370
+ dtype=params_dtype),
371
+ requires_grad=False)
372
+ layer.register_parameter("w13_weight", w13_weight)
373
+ set_weight_attrs(w13_weight, extra_weight_attrs)
374
+ if self.moe.has_bias:
375
+ w13_bias = torch.nn.Parameter(torch.zeros(
376
+ num_experts,
377
+ 2 * intermediate_size_per_partition,
378
+ dtype=params_dtype),
379
+ requires_grad=False)
380
+ layer.register_parameter("w13_bias", w13_bias)
381
+ set_weight_attrs(w13_bias, extra_weight_attrs)
382
+ # down_proj (row parallel)
383
+ w2_weight = torch.nn.Parameter(torch.empty(
384
+ num_experts,
385
+ hidden_size,
386
+ intermediate_size_per_partition,
387
+ dtype=params_dtype),
388
+ requires_grad=False)
389
+ layer.register_parameter("w2_weight", w2_weight)
390
+ set_weight_attrs(w2_weight, extra_weight_attrs)
391
+ if self.moe.has_bias:
392
+ w2_bias = torch.nn.Parameter(torch.zeros(num_experts,
393
+ hidden_size,
394
+ dtype=params_dtype),
395
+ requires_grad=False)
396
+ layer.register_parameter("w2_bias", w2_bias)
397
+ set_weight_attrs(w2_bias, extra_weight_attrs)
398
+
399
+ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
400
+ # Pad the weight tensor. This is an optimization for the ROCm platform,
401
+ # which benefits from tensors being spaced far enough apart in memory.
402
+ if (envs.VLLM_ROCM_MOE_PADDING and current_platform.is_rocm()
403
+ and weight.stride(-1) == 1
404
+ and (weight.stride(-2) * weight.element_size()) % 512 == 0):
405
+ num_pad = 256 // weight.element_size()
406
+ weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
407
+ torch.cuda.empty_cache()
408
+
409
+ return weight
410
+
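A rough, self-contained sketch of the padding arithmetic above for a bf16 weight (the shapes are made up; the real method only applies this on ROCm when VLLM_ROCM_MOE_PADDING is set and the stride condition holds):

import torch
import torch.nn.functional as F

w = torch.empty(8, 4096, 2048, dtype=torch.bfloat16)  # (experts, N, K)
num_pad = 256 // w.element_size()                      # 128 bf16 elements
padded = F.pad(w, (0, num_pad), "constant", 0)[..., :-num_pad]
# Shape and values are unchanged, but the second-to-last stride grew by
# num_pad, spacing consecutive rows 256 bytes further apart in memory.
assert padded.shape == w.shape
assert padded.stride(-2) == w.stride(-2) + num_pad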
411
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
412
+ super().process_weights_after_loading(layer)
413
+
414
+ # Padding the weight for better performance on ROCm
415
+ layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
416
+ layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
417
+ # Lazy import to avoid importing triton.
418
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
419
+ shuffle_weights)
420
+
421
+ if self.rocm_aiter_moe_enabled:
422
+ shuffled_w13, shuffled_w2 = shuffle_weights(
423
+ layer.w13_weight.data, layer.w2_weight.data)
424
+
425
+ layer.w13_weight.data = shuffled_w13
426
+ layer.w2_weight.data = shuffled_w2
427
+
428
+ if self.flashinfer_cutlass_moe_enabled:
429
+ # Swap halves to arrange as [w3; w1] (kernel expectation)
430
+ w1_w, w3_w = torch.chunk(layer.w13_weight.data, 2, dim=1)
431
+ w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)
432
+ layer.w13_weight.data = w13_weight_swapped.contiguous()
433
+
434
+ if current_platform.is_xpu():
435
+ import intel_extension_for_pytorch as ipex
436
+ layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
437
+ layer.w13_weight,
438
+ layer.w2_weight,
439
+ use_prepack=True,
440
+ )
441
+ elif current_platform.is_cpu():
442
+ from vllm.model_executor.layers.fused_moe import cpu_fused_moe
443
+ if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
444
+ from vllm.model_executor.layers.utils import (
445
+ check_cpu_sgl_kernel)
446
+ dtype_w13 = layer.w13_weight.dtype
447
+ _, n_w13, k_w13 = layer.w13_weight.size()
448
+ dtype_w2 = layer.w2_weight.dtype
449
+ _, n_w2, k_w2 = layer.w2_weight.size()
450
+ if (envs.VLLM_CPU_SGL_KERNEL
451
+ and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13)
452
+ and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)):
453
+ packed_w13_weight = torch.ops._C.convert_weight_packed(
454
+ layer.w13_weight)
455
+ assert packed_w13_weight.size() == layer.w13_weight.size()
456
+ layer.w13_weight.copy_(packed_w13_weight)
457
+ del packed_w13_weight
458
+ packed_w2_weight = torch.ops._C.convert_weight_packed(
459
+ layer.w2_weight)
460
+ assert packed_w2_weight.size() == layer.w2_weight.size()
461
+ layer.w2_weight.copy_(packed_w2_weight)
462
+ layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer)
463
+ else:
464
+ layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer)
465
+ else:
466
+ layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
467
+
468
+ def apply(
469
+ self,
470
+ layer: torch.nn.Module,
471
+ x: torch.Tensor,
472
+ router_logits: torch.Tensor,
473
+ top_k: int,
474
+ renormalize: bool,
475
+ use_grouped_topk: bool = False,
476
+ topk_group: Optional[int] = None,
477
+ num_expert_group: Optional[int] = None,
478
+ global_num_experts: int = -1,
479
+ expert_map: Optional[torch.Tensor] = None,
480
+ custom_routing_function: Optional[Callable] = None,
481
+ scoring_func: str = "softmax",
482
+ routed_scaling_factor: float = 1.0,
483
+ e_score_correction_bias: Optional[torch.Tensor] = None,
484
+ apply_router_weight_on_input: bool = False,
485
+ activation: str = "silu",
486
+ enable_eplb: bool = False,
487
+ expert_load_view: Optional[torch.Tensor] = None,
488
+ logical_to_physical_map: Optional[torch.Tensor] = None,
489
+ logical_replica_count: Optional[torch.Tensor] = None,
490
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
491
+ if enable_eplb:
492
+ assert expert_load_view is not None
493
+ assert logical_to_physical_map is not None
494
+ assert logical_replica_count is not None
495
+ assert isinstance(layer, FusedMoE)
496
+
497
+ return self.forward(
498
+ x=x,
499
+ layer=layer,
500
+ router_logits=router_logits,
501
+ top_k=top_k,
502
+ renormalize=renormalize,
503
+ use_grouped_topk=use_grouped_topk,
504
+ topk_group=topk_group,
505
+ num_expert_group=num_expert_group,
506
+ global_num_experts=global_num_experts,
507
+ expert_map=expert_map,
508
+ custom_routing_function=custom_routing_function,
509
+ scoring_func=scoring_func,
510
+ routed_scaling_factor=routed_scaling_factor,
511
+ e_score_correction_bias=e_score_correction_bias,
512
+ activation=activation,
513
+ apply_router_weight_on_input=apply_router_weight_on_input,
514
+ enable_eplb=enable_eplb,
515
+ expert_load_view=expert_load_view,
516
+ logical_to_physical_map=logical_to_physical_map,
517
+ logical_replica_count=logical_replica_count,
518
+ )
519
+
520
+ def get_fused_moe_quant_config(
521
+ self, layer: torch.nn.Module) -> Optional[FusedMoEQuantConfig]:
522
+ if self.moe.has_bias:
523
+ return biased_moe_quant_config(
524
+ layer.w13_bias,
525
+ layer.w2_bias,
526
+ )
527
+ else:
528
+ return FUSED_MOE_UNQUANTIZED_CONFIG
529
+
530
+ def forward_cuda(
531
+ self,
532
+ layer: torch.nn.Module,
533
+ x: torch.Tensor,
534
+ use_grouped_topk: bool,
535
+ top_k: int,
536
+ router_logits: torch.Tensor,
537
+ renormalize: bool,
538
+ topk_group: Optional[int] = None,
539
+ num_expert_group: Optional[int] = None,
540
+ global_num_experts: int = -1,
541
+ expert_map: Optional[torch.Tensor] = None,
542
+ custom_routing_function: Optional[Callable] = None,
543
+ scoring_func: str = "softmax",
544
+ routed_scaling_factor: float = 1.0,
545
+ e_score_correction_bias: Optional[torch.Tensor] = None,
546
+ apply_router_weight_on_input: bool = False,
547
+ activation: str = "silu",
548
+ enable_eplb: bool = False,
549
+ expert_load_view: Optional[torch.Tensor] = None,
550
+ logical_to_physical_map: Optional[torch.Tensor] = None,
551
+ logical_replica_count: Optional[torch.Tensor] = None,
552
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
553
+
554
+ zero_expert_num = getattr(layer, 'zero_expert_num', 0)
555
+ zero_expert_type = getattr(layer, 'zero_expert_type', None)
556
+
557
+ topk_weights, topk_ids, zero_expert_result = FusedMoE.select_experts(
558
+ hidden_states=x,
559
+ router_logits=router_logits,
560
+ use_grouped_topk=use_grouped_topk,
561
+ top_k=top_k,
562
+ renormalize=renormalize,
563
+ topk_group=topk_group,
564
+ num_expert_group=num_expert_group,
565
+ custom_routing_function=custom_routing_function,
566
+ scoring_func=scoring_func,
567
+ routed_scaling_factor=routed_scaling_factor,
568
+ e_score_correction_bias=e_score_correction_bias,
569
+ indices_type=self.topk_indices_dtype,
570
+ enable_eplb=enable_eplb,
571
+ expert_map=expert_map,
572
+ expert_load_view=expert_load_view,
573
+ logical_to_physical_map=logical_to_physical_map,
574
+ logical_replica_count=logical_replica_count,
575
+ global_num_experts=global_num_experts,
576
+ zero_expert_num=zero_expert_num,
577
+ zero_expert_type=zero_expert_type)
578
+
579
+ if self.rocm_aiter_moe_enabled:
580
+ assert self.fused_experts is None
581
+ result = self.rocm_aiter_fused_experts(
582
+ hidden_states=x,
583
+ w1=layer.w13_weight,
584
+ w2=layer.w2_weight,
585
+ topk_weights=topk_weights,
586
+ topk_ids=topk_ids,
587
+ expert_map=expert_map,
588
+ activation=activation,
589
+ apply_router_weight_on_input=apply_router_weight_on_input)
590
+ elif self.flashinfer_cutlass_moe_enabled:
591
+ return self.flashinfer_cutlass_moe(
592
+ hidden_states=x,
593
+ w1=layer.w13_weight,
594
+ w2=layer.w2_weight,
595
+ topk_weights=topk_weights,
596
+ topk_ids=topk_ids,
597
+ activation=activation,
598
+ apply_router_weight_on_input=apply_router_weight_on_input)
599
+ elif self.fused_experts is not None:
600
+ if self.moe.has_bias:
601
+ raise ValueError(
602
+ "FusedMoEModularKernel does not support bias.")
603
+ result = self.fused_experts(
604
+ hidden_states=x,
605
+ w1=layer.w13_weight,
606
+ w2=layer.w2_weight,
607
+ topk_weights=topk_weights,
608
+ topk_ids=topk_ids,
609
+ inplace=True,
610
+ activation=activation,
611
+ apply_router_weight_on_input=apply_router_weight_on_input,
612
+ global_num_experts=global_num_experts,
613
+ expert_map=expert_map,
614
+ )
615
+ else:
616
+ assert fused_experts is not None
617
+ result = fused_experts(
618
+ hidden_states=x,
619
+ w1=layer.w13_weight,
620
+ w2=layer.w2_weight,
621
+ topk_weights=topk_weights,
622
+ topk_ids=topk_ids,
623
+ inplace=True,
624
+ activation=activation,
625
+ quant_config=self.moe_quant_config,
626
+ apply_router_weight_on_input=apply_router_weight_on_input,
627
+ global_num_experts=global_num_experts,
628
+ expert_map=expert_map,
629
+ )
630
+
631
+ if zero_expert_num != 0 and zero_expert_type is not None:
632
+ assert not isinstance(result, tuple), \
633
+ "Shared + zero experts are mutually exclusive not yet supported"
634
+ return result, zero_expert_result
635
+ else:
636
+ return result
637
+
638
+ def forward_cpu(
639
+ self,
640
+ layer: torch.nn.Module,
641
+ x: torch.Tensor,
642
+ use_grouped_topk: bool,
643
+ top_k: int,
644
+ router_logits: torch.Tensor,
645
+ renormalize: bool,
646
+ topk_group: Optional[int] = None,
647
+ num_expert_group: Optional[int] = None,
648
+ global_num_experts: int = -1,
649
+ expert_map: Optional[torch.Tensor] = None,
650
+ custom_routing_function: Optional[Callable] = None,
651
+ scoring_func: str = "softmax",
652
+ routed_scaling_factor: float = 1.0,
653
+ e_score_correction_bias: Optional[torch.Tensor] = None,
654
+ apply_router_weight_on_input: bool = False,
655
+ activation: str = "silu",
656
+ enable_eplb: bool = False,
657
+ expert_load_view: Optional[torch.Tensor] = None,
658
+ logical_to_physical_map: Optional[torch.Tensor] = None,
659
+ logical_replica_count: Optional[torch.Tensor] = None,
660
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
661
+ if enable_eplb is not False or expert_load_view is not None or \
662
+ logical_to_physical_map is not None or \
663
+ logical_replica_count is not None:
664
+ raise NotImplementedError("Expert load balancing is not supported "
665
+ "for CPU.")
666
+ return layer.cpu_fused_moe(
667
+ layer,
668
+ x,
669
+ use_grouped_topk,
670
+ top_k,
671
+ router_logits,
672
+ renormalize,
673
+ topk_group,
674
+ num_expert_group,
675
+ global_num_experts,
676
+ expert_map,
677
+ custom_routing_function,
678
+ scoring_func,
679
+ routed_scaling_factor,
680
+ e_score_correction_bias,
681
+ apply_router_weight_on_input,
682
+ activation,
683
+ )
684
+
685
+ def forward_xpu(
686
+ self,
687
+ layer: torch.nn.Module,
688
+ x: torch.Tensor,
689
+ use_grouped_topk: bool,
690
+ top_k: int,
691
+ router_logits: torch.Tensor,
692
+ renormalize: bool,
693
+ topk_group: Optional[int] = None,
694
+ num_expert_group: Optional[int] = None,
695
+ global_num_experts: int = -1,
696
+ expert_map: Optional[torch.Tensor] = None,
697
+ custom_routing_function: Optional[Callable] = None,
698
+ scoring_func: str = "softmax",
699
+ routed_scaling_factor: float = 1.0,
700
+ e_score_correction_bias: Optional[torch.Tensor] = None,
701
+ apply_router_weight_on_input: bool = False,
702
+ activation: str = "silu",
703
+ enable_eplb: bool = False,
704
+ expert_load_view: Optional[torch.Tensor] = None,
705
+ logical_to_physical_map: Optional[torch.Tensor] = None,
706
+ logical_replica_count: Optional[torch.Tensor] = None,
707
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
708
+ if enable_eplb is not False or expert_load_view is not None or \
709
+ logical_to_physical_map is not None or \
710
+ logical_replica_count is not None:
711
+ raise NotImplementedError("Expert load balancing is not supported "
712
+ "for XPU.")
713
+ assert custom_routing_function is None
714
+ return layer.ipex_fusion(
715
+ x,
716
+ use_grouped_topk,
717
+ top_k,
718
+ router_logits,
719
+ renormalize,
720
+ topk_group,
721
+ num_expert_group,
722
+ )
723
+
724
+ def forward_tpu(
725
+ self,
726
+ layer: torch.nn.Module,
727
+ x: torch.Tensor,
728
+ use_grouped_topk: bool,
729
+ top_k: int,
730
+ router_logits: torch.Tensor,
731
+ renormalize: bool,
732
+ topk_group: Optional[int] = None,
733
+ num_expert_group: Optional[int] = None,
734
+ global_num_experts: int = -1,
735
+ expert_map: Optional[torch.Tensor] = None,
736
+ custom_routing_function: Optional[Callable] = None,
737
+ scoring_func: str = "softmax",
738
+ routed_scaling_factor: float = 1.0,
739
+ e_score_correction_bias: Optional[torch.Tensor] = None,
740
+ apply_router_weight_on_input: bool = False,
741
+ activation: str = "silu",
742
+ enable_eplb: bool = False,
743
+ expert_load_view: Optional[torch.Tensor] = None,
744
+ logical_to_physical_map: Optional[torch.Tensor] = None,
745
+ logical_replica_count: Optional[torch.Tensor] = None,
746
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
747
+ assert not use_grouped_topk
748
+ assert num_expert_group is None
749
+ assert topk_group is None
750
+ assert custom_routing_function is None
751
+ assert apply_router_weight_on_input is False
752
+ if scoring_func != "softmax":
753
+ raise NotImplementedError(
754
+ "Only softmax scoring function is supported for TPU.")
755
+ if e_score_correction_bias is not None:
756
+ raise NotImplementedError(
757
+ "Expert score correction bias is not supported for TPU.")
758
+ assert activation == "silu", f"{activation} is not supported for TPU."
759
+ assert routed_scaling_factor == 1.0, \
760
+ f"routed_scaling_factor {routed_scaling_factor} is not supported " \
761
+ f"for TPU."
762
+ if enable_eplb is not False or expert_load_view is not None or \
763
+ logical_to_physical_map is not None or \
764
+ logical_replica_count is not None:
765
+ raise NotImplementedError("Expert load balancing is not supported "
766
+ "for TPU.")
767
+ return fused_moe_pallas(hidden_states=x,
768
+ w1=layer.w13_weight,
769
+ w2=layer.w2_weight,
770
+ topk=top_k,
771
+ gating_output=router_logits,
772
+ global_num_experts=global_num_experts,
773
+ expert_map=expert_map,
774
+ renormalize=renormalize)
775
+
776
+ if current_platform.is_tpu():
777
+ forward_native = forward_tpu
778
+ elif current_platform.is_cpu():
779
+ forward_native = forward_cpu
780
+ elif current_platform.is_xpu():
781
+ forward_native = forward_xpu
782
+ else:
783
+ forward_native = forward_cuda
784
+
785
+
786
+ def determine_expert_map(
787
+ ep_size: int,
788
+ ep_rank: int,
789
+ global_num_experts: int,
790
+ expert_placement_strategy: ExpertPlacementStrategy = "linear",
791
+ ) -> tuple[int, Optional[torch.Tensor]]:
792
+ """
793
+ Calculates how many experts should be assigned to each rank for EP and
794
+ creates a mapping from global to local expert index. Experts are
795
+ distributed as evenly as possible across ranks; when the count does not
796
+ divide evenly, the lower-numbered ranks each receive one extra expert.
797
+
798
+ Args:
799
+ ep_size: The size of the expert parallel group
800
+ ep_rank: The rank of the current process in the expert parallel
801
+ group
802
+ global_num_experts: The total number of experts in the model.
803
+ expert_placement_strategy: The expert placement strategy.
804
+
805
+ Returns:
806
+ tuple[int, Optional[torch.Tensor]]: A tuple containing:
807
+ - local_num_experts (int): The number of experts assigned
808
+ to the current rank.
809
+ - expert_map (Optional[torch.Tensor]): A tensor of shape
810
+ (global_num_experts,) mapping from global to local index.
811
+ Contains -1 for experts not assigned to the current rank.
812
+ Returns None if ep_size is 1.
813
+ """
814
+ assert ep_size > 0
815
+ if ep_size == 1:
816
+ return (global_num_experts, None)
817
+
818
+ # Distribute experts as evenly as possible to each rank.
819
+ base_experts = global_num_experts // ep_size
820
+ remainder = global_num_experts % ep_size
821
+ if ep_rank < remainder:
822
+ local_num_experts = base_experts + 1
823
+ else:
824
+ local_num_experts = base_experts
825
+
826
+ # Create a tensor of size num_experts filled with -1
827
+ expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
828
+ # Create an expert map for the local experts
829
+ if expert_placement_strategy == "linear":
830
+ start_idx = ep_rank * base_experts + min(ep_rank, remainder)
831
+ expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
832
+ 0, local_num_experts, dtype=torch.int32)
833
+ elif expert_placement_strategy == "round_robin":
834
+ local_log_experts = torch.arange(ep_rank,
835
+ global_num_experts,
836
+ ep_size,
837
+ dtype=torch.int32)
838
+
839
+ expert_map[local_log_experts] = torch.arange(0,
840
+ local_num_experts,
841
+ dtype=torch.int32)
842
+ else:
843
+ raise ValueError("Unsupported expert placement strategy "
844
+ f"'{expert_placement_strategy}', expected one of "
845
+ f"{get_args(ExpertPlacementStrategy)}")
846
+ return (local_num_experts, expert_map)
847
+
848
+
849
+ def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
850
+ """
851
+ Compresses the expert map by removing any -1 entries.
852
+
853
+ Args:
854
+ expert_map (torch.Tensor): A tensor of shape (global_num_experts,)
855
+ mapping from global to local index. Contains -1 for experts not
856
+ assigned to the current rank.
857
+
858
+ Returns:
859
+ str: A string mapping from local to global index.
860
+ A string is returned so the result is hashable for log-once deduplication.
861
+ """
862
+ global_indices = torch.where(expert_map != -1)[0]
863
+ local_indices = expert_map[global_indices]
864
+ return ", ".join(
865
+ f"{local_index.item()}->{global_index.item()}"
866
+ for local_index, global_index in zip(local_indices, global_indices))
867
+
868
+
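A worked example of the two helpers above, assuming a hypothetical layout of 10 global experts over 4 EP ranks (per-rank counts come out as [3, 3, 2, 2]):

local_num_experts, expert_map = determine_expert_map(
    ep_size=4, ep_rank=1, global_num_experts=10,
    expert_placement_strategy="linear")
assert local_num_experts == 3
# Rank 1 owns global experts 3, 4, 5 under "linear" placement:
assert expert_map.tolist() == [-1, -1, -1, 0, 1, 2, -1, -1, -1, -1]
assert get_compressed_expert_map(expert_map) == "0->3, 1->4, 2->5"

# Under "round_robin" placement, rank 1 would instead own experts 1, 5, 9:
#   expert_map = [-1, 0, -1, -1, -1, 1, -1, -1, -1, 2]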
869
+ def maybe_roundup_hidden_size(
870
+ hidden_size: int, act_dtype: torch.dtype,
871
+ quant_config: Optional[QuantizationConfig],
872
+ moe_parallel_config: FusedMoEParallelConfig) -> int:
873
+ """
874
+ Given layer hidden size and MoE configurations, round up hidden_size
875
+ if necessary.
876
+
877
+ Args:
878
+ hidden_size: Layer hidden-size
879
+ act_dtype: Data type of the layer activations.
880
+ quant_config: Fused MoE quantization configuration.
881
+ moe_parallel_config: Fused MoE parallelization strategy configuration.
882
+
883
+ Returns:
884
+ Rounded up hidden_size if rounding up is required based on the configs.
885
+ Original hidden size otherwise.
886
+ """
887
+
888
+ if (moe_parallel_config.use_deepep_ht_kernels):
889
+ hidden_size = (
890
+ DeepEPHTPrepareAndFinalize.maybe_roundup_layer_hidden_size(
891
+ hidden_size, act_dtype))
892
+
893
+ # We pad globally so that EP buffer allocation works.
894
+ if quant_config and quant_config.get_name() == "mxfp4":
895
+
896
+ from vllm.model_executor.layers.quantization.mxfp4 import (
897
+ Mxfp4Backend, get_mxfp4_backend)
898
+ current_mxfp4_backend = get_mxfp4_backend()
899
+ if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
900
+ or current_mxfp4_backend
901
+ == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS):
902
+ hidden_size = round_up(hidden_size, 128)
903
+ elif (current_platform.is_rocm() or current_mxfp4_backend
904
+ == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
905
+ or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
906
+ hidden_size = round_up(hidden_size, 256)
907
+
908
+ return hidden_size
909
+
910
+
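A small worked example of the rounding above, using the round_up helper imported earlier (the hidden size is made up):

from vllm.utils import round_up

round_up(2880, 128)  # -> 2944, the SM90/SM100 CUTLASS mxfp4 paths
round_up(2880, 256)  # -> 3072, the ROCm and SM100 TRTLLM/BF16 mxfp4 paths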
911
+ @CustomOp.register("fused_moe")
912
+ class FusedMoE(CustomOp):
913
+ """FusedMoE layer for MoE models.
914
+
915
+ This layer contains both MergedColumnParallel weights (gate_up_proj /
916
+ w13) and RowParallelLinear weights (down_proj / w2).
917
+
918
+ Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
919
+ copy that naming convention here and handle any remapping in the
920
+ load_weights function in each model implementation.
921
+
922
+ Args:
923
+ num_experts: Number of experts in the model
924
+ top_k: Number of experts selected for each token
925
+ hidden_size: Input hidden state size of the transformer
926
+ intermediate_size: Intermediate size of the experts
927
+ params_dtype: Data type for the parameters.
928
+ reduce_results: Whether to apply all_reduce on the output of the layer
929
+ renormalize: Whether to renormalize the logits in the fused_moe kernel
930
+ quant_config: Quantization configuration.
931
+ enable_eplb: Whether to enable expert parallelism load balancer.
932
+ """
933
+
934
+ def __init__(
935
+ self,
936
+ num_experts: int, # Global number of experts
937
+ top_k: int,
938
+ hidden_size: int,
939
+ intermediate_size: int,
940
+ params_dtype: Optional[torch.dtype] = None,
941
+ reduce_results: bool = False,
942
+ renormalize: bool = True,
943
+ use_grouped_topk: bool = False,
944
+ num_expert_group: Optional[int] = None,
945
+ topk_group: Optional[int] = None,
946
+ quant_config: Optional[QuantizationConfig] = None,
947
+ tp_size: Optional[int] = None,
948
+ ep_size: Optional[int] = None,
949
+ dp_size: Optional[int] = None,
950
+ prefix: str = "",
951
+ custom_routing_function: Optional[Callable] = None,
952
+ scoring_func: str = "softmax",
953
+ routed_scaling_factor: float = 1.0,
954
+ e_score_correction_bias: Optional[torch.Tensor] = None,
955
+ apply_router_weight_on_input: bool = False,
956
+ activation: str = "silu",
957
+ enable_eplb: bool = False,
958
+ num_redundant_experts: int = 0,
959
+ has_bias: bool = False,
960
+ is_sequence_parallel=False,
961
+ zero_expert_num: Optional[int] = 0,
962
+ zero_expert_type: Optional[str] = None,
963
+ ):
964
+ super().__init__()
965
+ if params_dtype is None:
966
+ params_dtype = torch.get_default_dtype()
967
+ self.params_dtype = params_dtype
968
+
969
+ vllm_config = get_current_vllm_config()
970
+
971
+ # FIXME (varun): We should have a better way of inferring the activation
972
+ # datatype. This works for now as the tensor datatype entering the MoE
973
+ # operation is typically unquantized (i.e. float16/bfloat16).
974
+ if vllm_config.model_config is not None:
975
+ moe_in_dtype = vllm_config.model_config.dtype
976
+ else:
977
+ # TODO (bnell): This is a hack to get test_mixtral_moe to work
978
+ # since model_config is not set in the pytest test.
979
+ moe_in_dtype = params_dtype
980
+
981
+ tp_size_ = (tp_size if tp_size is not None else
982
+ get_tensor_model_parallel_world_size())
983
+ dp_size_ = (dp_size
984
+ if dp_size is not None else get_dp_group().world_size)
985
+
986
+ self.is_sequence_parallel = is_sequence_parallel
987
+ self.sp_size = tp_size_ if is_sequence_parallel else 1
988
+
989
+ self.moe_parallel_config: FusedMoEParallelConfig = (
990
+ FusedMoEParallelConfig.make(
991
+ tp_size_=tp_size_,
992
+ dp_size_=dp_size_,
993
+ vllm_parallel_config=vllm_config.parallel_config))
994
+
995
+ self.global_num_experts = num_experts + num_redundant_experts
996
+ self.zero_expert_num = zero_expert_num
997
+ self.zero_expert_type = zero_expert_type
998
+
999
+ # Round up hidden size if needed.
1000
+ hidden_size = maybe_roundup_hidden_size(hidden_size, moe_in_dtype,
1001
+ quant_config,
1002
+ self.moe_parallel_config)
1003
+
1004
+ # For smuggling this layer into the fused moe custom op
1005
+ compilation_config = vllm_config.compilation_config
1006
+ if prefix in compilation_config.static_forward_context:
1007
+ raise ValueError("Duplicate layer name: {}".format(prefix))
1008
+ compilation_config.static_forward_context[prefix] = self
1009
+ self.layer_name = prefix
1010
+
1011
+ self.enable_eplb = enable_eplb
1012
+ self.expert_load_view: Optional[torch.Tensor] = None
1013
+ self.logical_to_physical_map: Optional[torch.Tensor] = None
1014
+ self.logical_replica_count: Optional[torch.Tensor] = None
1015
+
1016
+ # Determine expert maps
1017
+ if self.use_ep:
1018
+ if self.enable_eplb:
1019
+ assert self.global_num_experts % self.ep_size == 0, \
1020
+ "EPLB currently only supports even distribution of " \
1021
+ "experts across ranks."
1022
+ else:
1023
+ assert num_redundant_experts == 0, \
1024
+ "Redundant experts are only supported with EPLB."
1025
+
1026
+ expert_placement_strategy = (
1027
+ vllm_config.parallel_config.expert_placement_strategy)
1028
+ if expert_placement_strategy == "round_robin":
1029
+ # TODO(Bruce): will support round robin expert placement with
1030
+ # EPLB enabled in the future.
1031
+ round_robin_supported = ((num_expert_group is not None
1032
+ and num_expert_group > 1)
1033
+ and num_redundant_experts == 0
1034
+ and not self.enable_eplb)
1035
+
1036
+ if not round_robin_supported:
1037
+ logger.warning(
1038
+ "Round-robin expert placement is only supported for "
1039
+ "models with multiple expert groups and no redundant "
1040
+ "experts. Falling back to linear expert placement.")
1041
+ expert_placement_strategy = "linear"
1042
+
1043
+ self.expert_map: Optional[torch.Tensor]
1044
+ local_num_experts, expert_map = determine_expert_map(
1045
+ ep_size=self.ep_size,
1046
+ ep_rank=self.ep_rank,
1047
+ global_num_experts=self.global_num_experts,
1048
+ expert_placement_strategy=expert_placement_strategy,
1049
+ )
1050
+ self.local_num_experts = local_num_experts
1051
+ self.register_buffer("expert_map", expert_map)
1052
+ logger.info_once(
1053
+ "[EP Rank %s/%s] Expert parallelism is enabled. Expert "
1054
+ "placement strategy: %s. Local/global"
1055
+ " number of experts: %s/%s. Experts local to global index map:"
1056
+ " %s.", self.ep_rank, self.ep_size, expert_placement_strategy,
1057
+ self.local_num_experts, self.global_num_experts,
1058
+ get_compressed_expert_map(self.expert_map))
1059
+ else:
1060
+ self.local_num_experts, self.expert_map = (self.global_num_experts,
1061
+ None)
1062
+
1063
+ self.top_k = top_k
1064
+
1065
+ assert intermediate_size % self.tp_size == 0
1066
+ self.hidden_size = hidden_size
1067
+ self.intermediate_size_per_partition = intermediate_size // self.tp_size
1068
+ self.reduce_results = reduce_results
1069
+ self.renormalize = renormalize
1070
+ self.use_grouped_topk = use_grouped_topk
1071
+ if self.use_grouped_topk:
1072
+ assert num_expert_group is not None and topk_group is not None
1073
+ self.num_expert_group = num_expert_group
1074
+ self.topk_group = topk_group
1075
+ self.custom_routing_function = custom_routing_function
1076
+ self.scoring_func = scoring_func
1077
+ self.routed_scaling_factor = routed_scaling_factor
1078
+ self.e_score_correction_bias = e_score_correction_bias
1079
+ self.apply_router_weight_on_input = apply_router_weight_on_input
1080
+ self.activation = activation
1081
+
1082
+ if self.scoring_func != "softmax" and not self.use_grouped_topk:
1083
+ raise ValueError("Only softmax scoring function is supported for "
1084
+ "non-grouped topk.")
1085
+
1086
+ moe = FusedMoEConfig(
1087
+ num_experts=self.global_num_experts,
1088
+ experts_per_token=top_k,
1089
+ hidden_dim=hidden_size,
1090
+ num_local_experts=self.local_num_experts,
1091
+ moe_parallel_config=self.moe_parallel_config,
1092
+ in_dtype=moe_in_dtype,
1093
+ max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE,
1094
+ has_bias=has_bias,
1095
+ )
1096
+ self.moe_config = moe
1097
+ self.moe_quant_config: Optional[FusedMoEQuantConfig] = None
1098
+ self.quant_config = quant_config
1099
+
1100
+ # Note: get_quant_method will look at the layer's local_num_experts
1101
+ # for heuristic purposes, so it must be initialized first.
1102
+ quant_method: Optional[QuantizeMethodBase] = None
1103
+ quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None
1104
+ else quant_config.get_quant_method(self, prefix))
1105
+
1106
+ assert quant_method is not None
1107
+ assert isinstance(quant_method, FusedMoEMethodBase)
1108
+ self.quant_method = quant_method
1109
+
1110
+ if self.enable_eplb:
1111
+ from vllm.model_executor.layers.quantization.fp8 import (
1112
+ Fp8MoEMethod)
1113
+ if not isinstance(quant_method,
1114
+ (Fp8MoEMethod, UnquantizedFusedMoEMethod)):
1115
+ # TODO: Add support for additional quantization methods.
1116
+ # The implementation for other quantization methods does not
1117
+ # contain essential differences, but the current quant API
1118
+ # design causes duplicated work when extending to new
1119
+ # quantization methods, so I'm leaving it for now.
1120
+ # If you plan to add support for more quantization methods,
1121
+ # please refer to the implementation in `Fp8MoEMethod`.
1122
+ raise NotImplementedError("EPLB is only supported for FP8 "
1123
+ "quantization for now.")
1124
+
1125
+ moe_quant_params = {
1126
+ "num_experts": self.local_num_experts,
1127
+ "hidden_size": hidden_size,
1128
+ "intermediate_size_per_partition":
1129
+ self.intermediate_size_per_partition,
1130
+ "params_dtype": params_dtype,
1131
+ "weight_loader": self.weight_loader,
1132
+ }
1133
+ # need full intermediate size pre-sharding for WNA16 act order
1134
+ if (self.quant_method.__class__.__name__
1135
+ in ("GPTQMarlinMoEMethod",
1136
+ "CompressedTensorsWNA16MarlinMoEMethod",
1137
+ "CompressedTensorsWNA16MoEMethod")):
1138
+ moe_quant_params["intermediate_size_full"] = intermediate_size
1139
+
1140
+ self.quant_method.create_weights(layer=self, **moe_quant_params)
1141
+
1142
+ # Chunked all2all staging tensor
1143
+ self.batched_hidden_states: Optional[torch.Tensor] = None
1144
+ self.batched_router_logits: Optional[torch.Tensor] = None
1145
+
1146
+ # TODO(bnell): flashinfer uses non-batched format.
1147
+ # Does it really need a batched buffer?
1148
+ if (self.moe_parallel_config.use_pplx_kernels
1149
+ or self.moe_parallel_config.use_deepep_ll_kernels
1150
+ or self.moe_config.use_flashinfer_cutlass_kernels):
1151
+ if vllm_config.parallel_config.enable_dbo:
1152
+ self.batched_hidden_states = torch.zeros(
1153
+ (2, moe.max_num_tokens, self.hidden_size),
1154
+ dtype=moe.in_dtype,
1155
+ device=torch.cuda.current_device())
1156
+
1157
+ # Note here we use `num_experts` which is logical expert count
1158
+ self.batched_router_logits = torch.zeros(
1159
+ (2, moe.max_num_tokens, num_experts),
1160
+ dtype=moe.in_dtype,
1161
+ device=torch.cuda.current_device())
1162
+ else:
1163
+ self.batched_hidden_states = torch.zeros(
1164
+ (moe.max_num_tokens, self.hidden_size),
1165
+ dtype=moe.in_dtype,
1166
+ device=torch.cuda.current_device())
1167
+
1168
+ # Note here we use `num_experts` which is logical expert count
1169
+ self.batched_router_logits = torch.zeros(
1170
+ (moe.max_num_tokens, num_experts),
1171
+ dtype=moe.in_dtype,
1172
+ device=torch.cuda.current_device())
1173
+
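A minimal sketch of how a model might construct this layer inside its own __init__ (argument values and the surrounding names such as config, quant_config and prefix are illustrative, not taken from any particular model):

self.experts = FusedMoE(
    num_experts=config.num_experts,
    top_k=config.num_experts_per_tok,
    hidden_size=config.hidden_size,
    intermediate_size=config.moe_intermediate_size,
    reduce_results=True,
    renormalize=True,
    quant_config=quant_config,
    prefix=f"{prefix}.experts",
)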
1174
+ @property
1175
+ def shared_experts(self) -> Optional[torch.nn.Module]:
1176
+ return None
1177
+
1178
+ @property
1179
+ def tp_size(self):
1180
+ return self.moe_parallel_config.tp_size
1181
+
1182
+ @property
1183
+ def dp_size(self):
1184
+ return self.moe_parallel_config.dp_size
1185
+
1186
+ @property
1187
+ def ep_size(self):
1188
+ return self.moe_parallel_config.ep_size
1189
+
1190
+ @property
1191
+ def tp_rank(self):
1192
+ return self.moe_parallel_config.tp_rank
1193
+
1194
+ @property
1195
+ def dp_rank(self):
1196
+ return self.moe_parallel_config.dp_rank
1197
+
1198
+ @property
1199
+ def ep_rank(self):
1200
+ return self.moe_parallel_config.ep_rank
1201
+
1202
+ @property
1203
+ def use_ep(self):
1204
+ return self.moe_parallel_config.use_ep
1205
+
1206
+ @property
1207
+ def use_pplx_kernels(self):
1208
+ return self.moe_parallel_config.use_pplx_kernels
1209
+
1210
+ @property
1211
+ def use_deepep_ht_kernels(self):
1212
+ return self.moe_parallel_config.use_deepep_ht_kernels
1213
+
1214
+ @property
1215
+ def use_deepep_ll_kernels(self):
1216
+ return self.moe_parallel_config.use_deepep_ll_kernels
1217
+
1218
+ @property
1219
+ def use_flashinfer_cutlass_kernels(self):
1220
+ return (self.moe_quant_config is not None
1221
+ and self.moe_quant_config.quant_dtype == "nvfp4"
1222
+ and self.moe_config.use_flashinfer_cutlass_kernels)
1223
+
1224
+ def update_expert_map(self):
1225
+ # ep_size and ep_rank should already be updated
1226
+ assert self.expert_map is not None
1227
+ with self.expert_map.device:
1228
+ local_num_experts, expert_map = determine_expert_map(
1229
+ ep_size=self.ep_size,
1230
+ ep_rank=self.ep_rank,
1231
+ global_num_experts=self.global_num_experts)
1232
+ self.local_num_experts = local_num_experts
1233
+ self.register_buffer("expert_map", expert_map)
1234
+
1235
+ def _load_per_tensor_weight_scale(self, shard_id: str,
1236
+ param: torch.nn.Parameter,
1237
+ loaded_weight: torch.Tensor,
1238
+ expert_id: int):
1239
+ param_data = param.data
1240
+ # for per tensor weight quantization
1241
+ if shard_id in ("w1", "w3"):
1242
+ # We have to keep the weight scales of w1 and w3 because
1243
+ # we need to re-quantize w1/w3 weights after weight loading.
1244
+ idx = 0 if shard_id == "w1" else 1
1245
+ param_data[expert_id][idx] = loaded_weight
1246
+ # If we are in the row parallel case (down_proj)
1247
+ elif shard_id == "w2":
1248
+ param_data[expert_id] = loaded_weight
1249
+
1250
+ def _load_combined_w13_weight_scale(self, shard_dim: int,
1251
+ loaded_weight: torch.Tensor,
1252
+ param: torch.Tensor, tp_rank: int):
1253
+ """
1254
+ Load w13 weight scales assuming that w1 weight scales and w3 weight
1255
+ scales are stored in the same loaded_weight tensor.
1256
+ """
1257
+ shard_size = param.shape[shard_dim]
1258
+ loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
1259
+ shard_size)
1260
+ param.copy_(loaded_weight)
1261
+
1262
+ def _load_model_weight_or_group_weight_scale(self,
1263
+ shard_dim: int,
1264
+ expert_data: torch.Tensor,
1265
+ shard_id: str,
1266
+ loaded_weight: torch.Tensor,
1267
+ tp_rank: int,
1268
+ load_full_w2: bool = False):
1269
+ """
1270
+ Load grouped weight scales for group quantization or model weights
1271
+ :param shard_dim: dimension to shard
1272
+ :param expert_data: parameter for a particular expert
1273
+ :param shard_id: either w1, w2, or w3
1274
+ :param loaded_weight: checkpoint weight to load into the param
1275
+ :param tp_rank: tensor parallel rank
1276
+ :param load_full_w2: whether to load the full, unsharded w2 weight.
1277
+ """
1278
+ if shard_id == "w2":
1279
+ # When act-order/g_idx is used, the w2 scales are not partitioned for
1281
+ # any TP case, as indicated by the `load_full` argument.
1281
+ self._load_w2(shard_dim=shard_dim,
1282
+ loaded_weight=loaded_weight,
1283
+ expert_data=expert_data,
1284
+ tp_rank=tp_rank,
1285
+ load_full=load_full_w2)
1286
+ elif shard_id in ("w1", "w3"):
1287
+ self._load_w13(shard_id=shard_id,
1288
+ shard_dim=shard_dim,
1289
+ loaded_weight=loaded_weight,
1290
+ expert_data=expert_data,
1291
+ tp_rank=tp_rank)
1292
+
1293
+ def _load_per_channel_weight_scale(self, expert_data: torch.Tensor,
1294
+ shard_dim: int, shard_id: str,
1295
+ loaded_weight: torch.Tensor,
1296
+ tp_rank: int):
1297
+ # for per channel weight quantization
1298
+ if shard_id == "w2":
1299
+ expert_data.copy_(loaded_weight)
1300
+ elif shard_id in ("w1", "w3"):
1301
+ self._load_w13(shard_id=shard_id,
1302
+ shard_dim=shard_dim,
1303
+ loaded_weight=loaded_weight,
1304
+ expert_data=expert_data,
1305
+ tp_rank=tp_rank)
1306
+
1307
+ def _load_w13(self,
1308
+ expert_data: torch.Tensor,
1309
+ shard_dim: int,
1310
+ shard_id: str,
1311
+ loaded_weight: torch.Tensor,
1312
+ tp_rank: int,
1313
+ load_full: bool = False):
1314
+
1315
+ # Index the loaded weight for tp sharding.
1316
+ # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
1317
+ shard_size = expert_data.shape[shard_dim] // 2
1318
+ if not load_full:
1319
+ loaded_weight = loaded_weight.narrow(shard_dim,
1320
+ shard_size * tp_rank,
1321
+ shard_size)
1322
+ # Narrow parameter and load.
1323
+ # w1, gate_proj: Load into first logical weight of w13.
1324
+ if shard_id == "w1":
1325
+ expert_data = expert_data.narrow(shard_dim, 0, shard_size)
1326
+ # w3, up_proj: Load into second logical weight of w13.
1327
+ else:
1328
+ assert shard_id == "w3"
1329
+ expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
1330
+ expert_data.copy_(loaded_weight)
1331
+
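A toy sketch of the narrowing above for the w1 (gate_proj) shard, with made-up sizes and tp_rank=1 (tp_size=2), so the checkpoint tensor holds the full, unsharded intermediate dimension:

import torch

inter_per_partition, hidden = 4, 8
expert_data = torch.zeros(2 * inter_per_partition, hidden)       # w13 slot
full_w1 = torch.arange(2 * inter_per_partition * hidden,
                       dtype=torch.float32).reshape(-1, hidden)  # checkpoint w1

shard_dim, tp_rank = 0, 1
shard_size = expert_data.shape[shard_dim] // 2                   # 4 rows
shard = full_w1.narrow(shard_dim, shard_size * tp_rank, shard_size)
# w1 fills the first half of w13; a w3 (up_proj) shard would fill the second.
expert_data.narrow(shard_dim, 0, shard_size).copy_(shard)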
1332
+ def _load_w2(self,
1333
+ expert_data: torch.Tensor,
1334
+ shard_dim: int,
1335
+ loaded_weight: torch.Tensor,
1336
+ tp_rank: int,
1337
+ load_full: bool = False):
1338
+
1339
+ # Index the loaded weight for tp sharding.
1340
+ # down_proj: "RowParallel" so tp sharding on input_dim
1341
+ # Narrow parameter and load.
1342
+ shard_size = expert_data.shape[shard_dim]
1343
+ if not load_full:
1344
+ loaded_weight = loaded_weight.narrow(shard_dim,
1345
+ shard_size * tp_rank,
1346
+ shard_size)
1347
+ # w2, down_proj: Load into only logical weight of w2.
1348
+ expert_data.copy_(loaded_weight)
1349
+
1350
+ def _load_single_value(self, param: torch.nn.Parameter,
1351
+ loaded_weight: torch.Tensor, expert_id: int):
1352
+ param_data = param.data
1353
+
1354
+ # Input scales can be loaded directly and should be equal.
1355
+ param_data[expert_id] = loaded_weight
1356
+
1357
+ def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
1358
+ shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int):
1359
+
1360
+ if shard_id == "w2":
1361
+ self._load_w2(shard_dim=shard_dim,
1362
+ loaded_weight=loaded_weight,
1363
+ expert_data=expert_data,
1364
+ tp_rank=tp_rank)
1365
+ else:
1366
+ assert shard_id in ("w1", "w3")
1367
+ expert_data.copy_(loaded_weight)
1368
+
1369
+ def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
1370
+ if self.expert_map is None:
1371
+ return expert_id
1372
+ return self.expert_map[expert_id].item()
1373
+
1374
+ @overload
1375
+ def weight_loader(self, param: torch.nn.Parameter,
1376
+ loaded_weight: torch.Tensor, weight_name: str,
1377
+ shard_id: str, expert_id: int,
1378
+ return_success: Literal[False]) -> None:
1379
+ ...
1380
+
1381
+ @overload
1382
+ def weight_loader(self, param: torch.nn.Parameter,
1383
+ loaded_weight: torch.Tensor, weight_name: str,
1384
+ shard_id: str, expert_id: int,
1385
+ return_success: Literal[True]) -> bool:
1386
+ ...
1387
+
1388
+ def weight_loader(self,
1389
+ param: torch.nn.Parameter,
1390
+ loaded_weight: torch.Tensor,
1391
+ weight_name: str,
1392
+ shard_id: str,
1393
+ expert_id: int,
1394
+ return_success: bool = False) -> Optional[bool]:
1395
+
1396
+ if self.quant_config and self.quant_config.get_name() == "mxfp4":
1397
+ # (FIXME) for gpt-oss all experts are combined
1398
+ if "bias" in weight_name:
1399
+ dim1 = loaded_weight.shape[1]
1400
+ param.data[:, :dim1].copy_(loaded_weight)
1401
+ else:
1402
+ dim1 = loaded_weight.shape[1]
1403
+ dim2 = loaded_weight.shape[2]
1404
+ param.data[:, :dim1, :dim2].copy_(loaded_weight)
1405
+ return True if return_success else None
1406
+
1407
+ expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
1408
+ if expert_id == -1:
1409
+ # Failed to load this param since it's not local to this rank
1410
+ return False if return_success else None
1411
+ # Hereafter, `expert_id` is local physical id
1412
+
1413
+ quant_method_name = self.quant_method.__class__.__name__
1414
+ # compressed-tensors checkpoints with packed weights are stored flipped
1415
+ # TODO (mgoin): check self.quant_method.quant_config.quant_format
1416
+ # against known CompressionFormat enum values that have this quality
1417
+ if self.quant_method.__class__.__name__ in (
1418
+ "CompressedTensorsWNA16MarlinMoEMethod",
1419
+ "CompressedTensorsWNA16MoEMethod"):
1420
+ loaded_weight = loaded_weight.t().contiguous()
1421
+
1422
+ if shard_id not in ("w1", "w2", "w3"):
1423
+ raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
1424
+ f"got {shard_id}.")
1425
+
1426
+ # Fetch the dim along which to shard the parameter/loaded weight,
1427
+ # based on the shard id. This will be whichever dimension
1428
+ # intermediate_size_per_partition lies along.
1429
+ SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
1430
+
1431
+ is_gguf_weight = getattr(param, "is_gguf_weight", False)
1432
+ is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
1433
+ if is_gguf_weight_type:
1434
+ param.weight_type = loaded_weight.item()
1435
+ param.data.copy_(loaded_weight)
1436
+ return True if return_success else None
1437
+
1438
+ # Case for BitsAndBytes
1439
+ use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
1440
+ if use_bitsandbytes_4bit:
1441
+ shard_dim = 0
1442
+
1443
+ expert_data = param.data[expert_id]
1444
+ if shard_id == "w2":
1445
+ expert_data.copy_(loaded_weight)
1446
+ elif shard_id in ("w1", "w3"):
1447
+ # BNB inflight quantization has already sharded the weights
1448
+ full_load = True
1449
+ self._load_w13(
1450
+ shard_id=shard_id,
1451
+ shard_dim=shard_dim,
1452
+ loaded_weight=loaded_weight,
1453
+ expert_data=expert_data,
1454
+ tp_rank=self.tp_rank,
1455
+ load_full=full_load,
1456
+ )
1457
+ return True if return_success else None
1458
+
1459
+ # is_transposed: whether the dim used to shard the weight
1460
+ # should be flipped. Required by GPTQ and compressed-tensors, where
1461
+ # the shard dim is whichever dimension holds intermediate_size_per_partition.
1462
+ is_transposed = getattr(param, "is_transposed", False)
1463
+ shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
1464
+ if is_transposed:
1465
+ shard_dim = int(not shard_dim)
1466
+
1467
+ full_load = len(loaded_weight.shape) == 3
1468
+ if full_load:
1469
+ shard_dim += 1
1470
+
1471
+ # Materialize GGUF UninitializedParameter
1472
+ if is_gguf_weight and isinstance(param, UninitializedParameter):
1473
+ final_shape = list(loaded_weight.shape)
1474
+ if shard_id in ["w1", "w3"]:
1475
+ final_shape[1] *= 2
1476
+ final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
1477
+ param.materialize(final_shape, dtype=loaded_weight.dtype)
1478
+
1479
+ expert_data = param.data if full_load else param.data[expert_id]
1480
+
1481
+ # Case input scale: input_scale loading is only supported for fp8
1482
+ if "input_scale" in weight_name:
1483
+ # this is needed for compressed-tensors only
1484
+ loaded_weight = loaded_weight.to(param.data.device)
1485
+
1486
+ if ("compressed" in quant_method_name.lower()
1487
+ and param.data[expert_id] != 1
1488
+ and (param.data[expert_id] - loaded_weight).abs() > 1e-5):
1489
+ raise ValueError(
1490
+ "input_scales of w1 and w3 of a layer "
1491
+ f"must be equal. But got {param.data[expert_id]} "
1492
+ f"vs. {loaded_weight}")
1493
+
1494
+ self._load_single_value(param=param,
1495
+ loaded_weight=loaded_weight,
1496
+ expert_id=expert_id)
1497
+ return True if return_success else None
1498
+
1499
+ # Case g_idx
1500
+ if "g_idx" in weight_name:
1501
+ self._load_g_idx(shard_dim=0,
1502
+ shard_id=shard_id,
1503
+ loaded_weight=loaded_weight,
1504
+ expert_data=expert_data,
1505
+ tp_rank=self.tp_rank)
1506
+ return True if return_success else None
1507
+
1508
+ # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern
1509
+ if "ModelOpt" in quant_method_name:
1510
+ # Determine per-tensor weight scale patterns based on variant
1511
+ # Use the dedicated method instead of brittle string matching
1512
+ uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern(
1513
+ )
1514
+
1515
+ # Call _load_per_tensor_weight_scale() to load per-tensor (scalar)
1516
+ # weights scales.
1517
+ # Input scales are always per-tensor.
1518
+ # Weight scales: FP4 uses "weight_scale_2" and FP8 uses
1519
+ # "weight_scale" for per-tensor scales.
1520
+ is_per_tensor = ("weight_scale_2" in weight_name
1521
+ if uses_weight_scale_2 else "weight_scale"
1522
+ in weight_name) or "input_scale" in weight_name
1523
+ if is_per_tensor:
1524
+ self._load_per_tensor_weight_scale(
1525
+ shard_id=shard_id,
1526
+ param=param,
1527
+ loaded_weight=loaded_weight,
1528
+ expert_id=expert_id,
1529
+ )
1530
+ return True if return_success else None
1531
+
1532
+ # If the weight is w13_weight_scale and w13_weight_scales are
1533
+ # combined into a single loaded_weight, call
1534
+ # _load_combined_w13_weight_scale() to load it.
1535
+ # This is checked by comparing the hidden_out dims of the
1536
+ # loaded_weight and the param.
1537
+ if "w13_weight_scale" in weight_name:
1538
+ loaded_weight_hidden_out = loaded_weight.shape[-2]
1539
+ param_hidden_out = param.data.shape[-2] * self.tp_size
1540
+ if loaded_weight_hidden_out == param_hidden_out:
1541
+ self._load_combined_w13_weight_scale(
1542
+ shard_dim=shard_dim,
1543
+ loaded_weight=loaded_weight,
1544
+ param=param,
1545
+ tp_rank=self.tp_rank,
1546
+ )
1547
+ return True if return_success else None
1548
+
1549
+ # For other weights, call _load_model_weight_or_group_weight_scale()
1550
+ # to load it.
1551
+ if "weight" in weight_name:
1552
+ self._load_model_weight_or_group_weight_scale(
1553
+ shard_id=shard_id,
1554
+ shard_dim=shard_dim,
1555
+ loaded_weight=loaded_weight,
1556
+ expert_data=expert_data,
1557
+ tp_rank=self.tp_rank)
1558
+ return True if return_success else None
1559
+
1560
+ # Case weight scales, zero_points and offset, weight/input global scales
1561
+ if ("scale" in weight_name or "zero" in weight_name
1562
+ or "offset" in weight_name):
1563
+ # load the weight scales and zp based on the quantization scheme
1564
+ # supported weight scales/zp can be found in
1565
+ # FusedMoeWeightScaleSupported
1566
+ # TODO @dsikka: once hardened, refactor to use vLLM Parameters
1567
+ # specific to each case
1568
+ quant_method = getattr(param, "quant_method", None)
1569
+ if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
1570
+ self._load_per_channel_weight_scale(
1571
+ shard_id=shard_id,
1572
+ shard_dim=shard_dim,
1573
+ loaded_weight=loaded_weight,
1574
+ expert_data=expert_data,
1575
+ tp_rank=self.tp_rank)
1576
+ elif quant_method in [
1577
+ FusedMoeWeightScaleSupported.GROUP.value,
1578
+ FusedMoeWeightScaleSupported.BLOCK.value,
1579
+ ]:
1580
+ self._load_model_weight_or_group_weight_scale(
1581
+ shard_id=shard_id,
1582
+ shard_dim=shard_dim,
1583
+ loaded_weight=loaded_weight,
1584
+ expert_data=expert_data,
1585
+ tp_rank=self.tp_rank,
1586
+ load_full_w2=getattr(param, "load_full_w2", False))
1587
+ elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
1588
+ self._load_per_tensor_weight_scale(shard_id=shard_id,
1589
+ param=param,
1590
+ loaded_weight=loaded_weight,
1591
+ expert_id=expert_id)
1592
+ else:
1593
+ WEIGHT_SCALE_SUPPORTED = [
1594
+ e.value for e in FusedMoeWeightScaleSupported
1595
+ ]
1596
+ raise ValueError(
1597
+ f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
1598
+ return True if return_success else None
1599
+
1600
+ # Case weight_shape
1601
+ if "weight_shape" in weight_name:
1602
+ # only required by compressed-tensors
1603
+ self._load_single_value(param=param,
1604
+ loaded_weight=loaded_weight,
1605
+ expert_id=expert_id)
1606
+ return True if return_success else None
1607
+
1608
+ # Case model weights
1609
+ if "weight" in weight_name:
1610
+ self._load_model_weight_or_group_weight_scale(
1611
+ shard_id=shard_id,
1612
+ shard_dim=shard_dim,
1613
+ loaded_weight=loaded_weight,
1614
+ expert_data=expert_data,
1615
+ tp_rank=self.tp_rank)
1616
+ return True if return_success else None
1617
+
1618
+ return False if return_success else None
1619
+
1620
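+ # A minimal sketch, not part of this file, of how a model's load_weights loop
+ # can consume the return_success overload above; `moe_layer`, `param`,
+ # `ckpt_tensor`, `weight_name`, `shard_id` and `expert_id` are hypothetical
+ # stand-ins for objects the surrounding loader code already has.
+ #
+ #   success = moe_layer.weight_loader(param,
+ #                                     ckpt_tensor,
+ #                                     weight_name,
+ #                                     shard_id=shard_id,
+ #                                     expert_id=expert_id,
+ #                                     return_success=True)
+ #   if not success:
+ #       # e.g. the expert is not local to this EP rank (its id mapped to -1),
+ #       # so the caller skips it without treating it as an error.
+ #       continue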
+ def get_expert_weights(self) -> Iterable[torch.Tensor]:
1621
+ weights = list(self.named_parameters())
1622
+ assert all(weight.is_contiguous() for _, weight in weights)
1623
+
1624
+ # Filter out the non-expert weights.
1625
+ # `e_score_correction_bias` is a bias for each logical expert,
1626
+ # with shape (num_logical_experts,), not an expert weight.
1627
+ NON_EXPERT_WEIGHTS = {
1628
+ "e_score_correction_bias",
1629
+ }
1630
+
1631
+ return [
1632
+ weight.view(self.local_num_experts, -1) for name, weight in weights
1633
+ if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size(
1634
+ []) and not name.startswith("_shared_experts.")
1635
+ ]
1636
+
1637
+ def set_eplb_state(
1638
+ self,
1639
+ moe_layer_idx: int,
1640
+ expert_load_view: torch.Tensor,
1641
+ logical_to_physical_map: torch.Tensor,
1642
+ logical_replica_count: torch.Tensor,
1643
+ ) -> None:
1644
+ """
1645
+ Register the EPLB state in this layer.
1646
+
1647
+ This is used later in the forward pass, where we get the expert mapping
1648
+ and record the load metrics in `expert_load_view`.
1649
+ """
1650
+ self.expert_load_view = expert_load_view[moe_layer_idx]
1651
+ self.logical_to_physical_map = logical_to_physical_map[moe_layer_idx]
1652
+ self.logical_replica_count = logical_replica_count[moe_layer_idx]
1653
+
1654
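+ # A minimal sketch of registering EPLB state, assuming stacked tensors whose
+ # first dimension indexes the MoE layers of the model (`moe_layers` and the
+ # tensor names are illustrative, not from this file):
+ #
+ #   for idx, moe in enumerate(moe_layers):
+ #       moe.set_eplb_state(
+ #           moe_layer_idx=idx,
+ #           expert_load_view=expert_load_view,
+ #           logical_to_physical_map=logical_to_physical_map,
+ #           logical_replica_count=logical_replica_count,
+ #       )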
+ def ensure_moe_quant_config(self):
1655
+ if self.quant_method.moe_quant_config is None:
1656
+ self.quant_method.moe_quant_config = (
1657
+ self.quant_method.get_fused_moe_quant_config(self))
1658
+
1659
+ @staticmethod
1660
+ def select_experts(
1661
+ hidden_states: torch.Tensor,
1662
+ router_logits: torch.Tensor,
1663
+ top_k: int,
1664
+ use_grouped_topk: bool,
1665
+ renormalize: bool,
1666
+ topk_group: Optional[int] = None,
1667
+ num_expert_group: Optional[int] = None,
1668
+ custom_routing_function: Optional[Callable] = None,
1669
+ scoring_func: str = "softmax",
1670
+ routed_scaling_factor: float = 1.0,
1671
+ e_score_correction_bias: Optional[torch.Tensor] = None,
1672
+ indices_type: Optional[torch.dtype] = None,
1673
+ enable_eplb: bool = False,
1674
+ expert_map: Optional[torch.Tensor] = None,
1675
+ expert_load_view: Optional[torch.Tensor] = None,
1676
+ logical_to_physical_map: Optional[torch.Tensor] = None,
1677
+ logical_replica_count: Optional[torch.Tensor] = None,
1678
+ global_num_experts: Optional[int] = None,
1679
+ zero_expert_num: Optional[int] = None,
1680
+ zero_expert_type: Optional[str] = None,
1681
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1682
+ """
1683
+ Route the input hidden states to the top-k experts based on the
1684
+ router logits.
1685
+
1686
+ Returns:
1687
+ (topk_weights, topk_ids, zero_expert_result)
1688
+ (tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
1689
+ The weights, expert ids, and zero expert computation result.
1690
+
1691
+ **Compatibility**: When EPLB is not enabled, the returned ids are
1692
+ equivalent to global logical ids, and should therefore be compatible with
1693
+ plain MoE implementations without redundant experts.
1694
+ """
1695
+ from vllm.model_executor.layers.fused_moe.fused_moe import (
1696
+ fused_topk, fused_topk_bias)
1697
+
1698
+ # Check if we should use a routing simulation strategy
1699
+ routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
1700
+ if routing_strategy != "":
1701
+ topk_weights, topk_ids = RoutingSimulator.simulate_routing(
1702
+ hidden_states=hidden_states,
1703
+ router_logits=router_logits,
1704
+ strategy_name=routing_strategy,
1705
+ top_k=top_k,
1706
+ indices_type=indices_type)
1707
+
1708
+ # DeepSeekv2 uses grouped_top_k
1709
+ elif use_grouped_topk:
1710
+ assert topk_group is not None
1711
+ assert num_expert_group is not None
1712
+ topk_weights, topk_ids = grouped_topk(
1713
+ hidden_states=hidden_states,
1714
+ gating_output=router_logits,
1715
+ topk=top_k,
1716
+ renormalize=renormalize,
1717
+ num_expert_group=num_expert_group,
1718
+ topk_group=topk_group,
1719
+ scoring_func=scoring_func,
1720
+ routed_scaling_factor=routed_scaling_factor,
1721
+ e_score_correction_bias=e_score_correction_bias)
1722
+ if indices_type is not None:
1723
+ topk_ids = topk_ids.to(dtype=indices_type)
1724
+ elif e_score_correction_bias is not None:
1725
+ topk_weights, topk_ids = fused_topk_bias(
1726
+ hidden_states=hidden_states,
1727
+ gating_output=router_logits,
1728
+ e_score_correction_bias=e_score_correction_bias.data,
1729
+ topk=top_k,
1730
+ renormalize=renormalize,
1731
+ )
1732
+ if routed_scaling_factor is not None:
1733
+ topk_weights *= routed_scaling_factor
1734
+ elif custom_routing_function is None:
1735
+ topk_weights, topk_ids, token_expert_indices = fused_topk(
1736
+ hidden_states=hidden_states,
1737
+ gating_output=router_logits,
1738
+ topk=top_k,
1739
+ renormalize=renormalize,
1740
+ indices_type=indices_type,
1741
+ )
1742
+ else:
1743
+ topk_weights, topk_ids = custom_routing_function(
1744
+ hidden_states=hidden_states,
1745
+ gating_output=router_logits,
1746
+ topk=top_k,
1747
+ renormalize=renormalize)
1748
+ if indices_type is not None:
1749
+ topk_ids = topk_ids.to(dtype=indices_type)
1750
+
1751
+ if enable_eplb:
1752
+ assert expert_load_view is not None
1753
+ assert logical_to_physical_map is not None
1754
+ assert logical_replica_count is not None
1755
+
1756
+ topk_ids = eplb_map_to_physical_and_record(
1757
+ topk_ids=topk_ids,
1758
+ expert_load_view=expert_load_view,
1759
+ logical_to_physical_map=logical_to_physical_map,
1760
+ logical_replica_count=logical_replica_count,
1761
+ indices_type=indices_type,
1762
+ )
1763
+
1764
+ assert topk_ids.dtype == indices_type or indices_type is None
1765
+
1766
+ # Compute zero expert result if needed
1767
+ if (zero_expert_num is not None and zero_expert_num > 0
1768
+ and zero_expert_type is not None
1769
+ and global_num_experts is not None):
1770
+ zero_expert_result = zero_experts_compute_triton(
1771
+ expert_indices=topk_ids,
1772
+ expert_scales=topk_weights,
1773
+ num_experts=global_num_experts,
1774
+ zero_expert_type=zero_expert_type,
1775
+ hidden_states=hidden_states,
1776
+ )
1777
+ else:
1778
+ zero_expert_result = None
1779
+ return topk_weights, topk_ids, zero_expert_result
1780
+
1781
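+ # A minimal, self-contained sketch of calling the static routing helper above
+ # with plain softmax top-k routing (no grouped top-k, EPLB or zero experts);
+ # all sizes below are illustrative.
+ def _example_select_experts() -> None:
+     hidden = torch.randn(4, 1024)   # (num_tokens, hidden_size)
+     logits = torch.randn(4, 64)     # (num_tokens, global_num_experts)
+     topk_weights, topk_ids, zero_expert_result = FusedMoE.select_experts(
+         hidden_states=hidden,
+         router_logits=logits,
+         top_k=2,
+         use_grouped_topk=False,
+         renormalize=True,
+     )
+     # Both routing outputs are (num_tokens, top_k); zero_expert_result is None
+     # because no zero experts were requested.
+     assert topk_weights.shape == topk_ids.shape == (4, 2)
+     assert zero_expert_result is None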
+ def must_reduce_shared_expert_outputs(self) -> bool:
1782
+ """
1783
+ The shared_experts are typically computed using the RowParallelLinear
1784
+ layer. The result of this function is typically used as
1785
+ the reduce_results argument to the module.
1786
+ When only tensor parallelism is used, it is not required to reduce
1787
+ the shared_experts results immediately. Instead, we reduce
1788
+ once at the end of the MoE op. (Refer to DeepSeekV2MoE module)
1789
+ With EP and all2all kernels, this is no longer viable, as all
1790
+ GPU ranks in DP produce the complete set of hidden_states.
1791
+ Therefore it is required that we reduce the shared_experts output
1792
+ early.
1793
+ """
1794
+ return (self.use_pplx_kernels or self.use_deepep_ht_kernels
1795
+ or self.use_deepep_ll_kernels)
1796
+
1797
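+ # A minimal sketch of the usage described in the docstring above: the flag is
+ # forwarded as reduce_results when building the shared-experts down
+ # projection (the sizes and variable names here are illustrative):
+ #
+ #   down_proj = RowParallelLinear(
+ #       intermediate_size,
+ #       hidden_size,
+ #       bias=False,
+ #       reduce_results=moe.must_reduce_shared_expert_outputs(),
+ #   )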
+ def maybe_all_reduce_tensor_model_parallel(
1798
+ self, final_hidden_states: torch.Tensor):
1799
+ """
1800
+ The pplx combine kernel (like the DeepEP combine kernels) already reduces
+ across GPU ranks, so no extra tensor-model-parallel all-reduce is applied here.
1801
+ """
1802
+ if (self.use_pplx_kernels or self.use_deepep_ht_kernels
1803
+ or self.use_deepep_ll_kernels):
1804
+ return final_hidden_states
1805
+ else:
1806
+ return tensor_model_parallel_all_reduce(final_hidden_states)
1807
+
1808
+ def forward_native(
1809
+ self,
1810
+ hidden_states: torch.Tensor,
1811
+ router_logits: torch.Tensor,
1812
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
1813
+ og_hidden_states = hidden_states.shape[-1]
1814
+ if self.hidden_size != og_hidden_states:
1815
+ hidden_states = F.pad(hidden_states,
1816
+ (0, self.hidden_size - og_hidden_states),
1817
+ mode='constant',
1818
+ value=0.0)
1819
+
1820
+ if self.shared_experts is None:
1821
+ if current_platform.is_tpu():
1822
+ # TODO: Once the OOM issue for the TPU backend is resolved, we
1823
+ # will switch to using the moe_forward custom op.
1824
+ fused_output = self.forward_impl(hidden_states, router_logits)
1825
+ assert not isinstance(fused_output, tuple)
1826
+ else:
1827
+ fused_output = torch.ops.vllm.moe_forward(
1828
+ hidden_states, router_logits, self.layer_name)
1829
+ return fused_output[..., :og_hidden_states]
1830
+ else:
1831
+ if current_platform.is_tpu():
1832
+ # TODO: Once the OOM issue for the TPU backend is resolved, we
1833
+ # will switch to using the moe_forward custom op.
1834
+ shared_output, fused_output = self.forward_impl(
1835
+ hidden_states, router_logits)
1836
+ else:
1837
+ shared_output, fused_output = torch.ops.vllm.moe_forward_shared(
1838
+ hidden_states, router_logits, self.layer_name)
1839
+ return (shared_output[..., :og_hidden_states],
1840
+ fused_output[..., :og_hidden_states])
1841
+
1842
+ def forward_cuda(
1843
+ self,
1844
+ hidden_states: torch.Tensor,
1845
+ router_logits: torch.Tensor,
1846
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
1847
+ return self.forward_native(hidden_states, router_logits)
1848
+
1849
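+ # A minimal, self-contained sketch of the pad/slice round trip used in
+ # forward_native above: activations narrower than self.hidden_size are
+ # zero-padded on the last dim and the original width is sliced back off the
+ # output (1000 and 1024 are illustrative sizes).
+ def _example_pad_round_trip() -> None:
+     x = torch.randn(4, 1000)                         # og_hidden_states = 1000
+     padded = F.pad(x, (0, 1024 - 1000), mode='constant', value=0.0)
+     assert padded.shape == (4, 1024)
+     restored = padded[..., :1000]                    # back to the original width
+     assert torch.equal(restored, x)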
+ def forward_impl_chunked(
1850
+ self,
1851
+ full_hidden_states: torch.Tensor,
1852
+ full_router_logits: torch.Tensor,
1853
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
1854
+ assert self.batched_hidden_states is not None
1855
+ assert self.batched_router_logits is not None
1856
+ assert self.batched_hidden_states.dtype == full_hidden_states.dtype
1857
+ assert self.batched_router_logits.dtype == full_router_logits.dtype
1858
+ # Check size compatibility.
1859
+ assert (
1860
+ self.batched_hidden_states.size(-1) == full_hidden_states.size(-1))
1861
+ assert (
1862
+ self.batched_router_logits.size(-1) == full_router_logits.size(-1))
1863
+
1864
+ self.ensure_moe_quant_config()
1865
+
1866
+ full_fused_final_hidden_states = torch.empty_like(full_hidden_states)
1867
+ if self.shared_experts is not None:
1868
+ full_shared_final_hidden_states = torch.empty_like(
1869
+ full_hidden_states)
1870
+
1871
+ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
1872
+ chunk_size = chunk_end - chunk_start
1873
+ hidden_states = full_hidden_states[chunk_start:chunk_end, :]
1874
+ router_logits = full_router_logits[chunk_start:chunk_end, :]
1875
+
1876
+ assert self.batched_hidden_states is not None
1877
+ assert self.batched_router_logits is not None
1878
+ # This is only true when DBO has been enabled in the config.
1879
+ # Both tensors will have an outer dimension for the ubatch id
1880
+ if self.batched_hidden_states.dim() == 3:
1881
+ assert self.batched_router_logits.dim() == 3
1882
+ batch_buffer_idx = dbo_current_ubatch_id()
1883
+ batched_hidden_states = self.batched_hidden_states[
1884
+ batch_buffer_idx, :]
1885
+ batched_router_logits = self.batched_router_logits[
1886
+ batch_buffer_idx, :]
1887
+ else:
1888
+ batched_hidden_states = self.batched_hidden_states
1889
+ batched_router_logits = self.batched_router_logits
1890
+
1891
+ assert (batched_hidden_states.size(0) # type: ignore
1892
+ >= chunk_size)
1893
+ assert (batched_router_logits.size(0) # type: ignore
1894
+ >= chunk_size)
1895
+ staged_hidden_states = batched_hidden_states[:
1896
+ chunk_size, :] # type: ignore
1897
+ staged_router_logits = batched_router_logits[:
1898
+ chunk_size, :] # type: ignore
1899
+ staged_hidden_states.copy_(hidden_states, non_blocking=True)
1900
+ staged_router_logits.copy_(router_logits, non_blocking=True)
1901
+
1902
+ # Matrix multiply.
1903
+ final_hidden_states = self.quant_method.apply(
1904
+ layer=self,
1905
+ x=staged_hidden_states,
1906
+ router_logits=staged_router_logits,
1907
+ top_k=self.top_k,
1908
+ renormalize=self.renormalize,
1909
+ use_grouped_topk=self.use_grouped_topk,
1910
+ global_num_experts=self.global_num_experts,
1911
+ expert_map=self.expert_map,
1912
+ topk_group=self.topk_group,
1913
+ num_expert_group=self.num_expert_group,
1914
+ custom_routing_function=self.custom_routing_function,
1915
+ scoring_func=self.scoring_func,
1916
+ routed_scaling_factor=self.routed_scaling_factor,
1917
+ e_score_correction_bias=self.e_score_correction_bias,
1918
+ activation=self.activation,
1919
+ enable_eplb=self.enable_eplb,
1920
+ expert_load_view=self.expert_load_view,
1921
+ logical_to_physical_map=self.logical_to_physical_map,
1922
+ logical_replica_count=self.logical_replica_count,
1923
+ )
1924
+
1925
+ assert self.shared_experts is None or isinstance(
1926
+ final_hidden_states, tuple)
1927
+
1928
+ if self.zero_expert_num is not None and self.zero_expert_num > 0:
1929
+ assert isinstance(final_hidden_states, tuple)
1930
+ assert self.shared_experts is None
1931
+ final_hidden_states, zero_expert_result = final_hidden_states
1932
+ if zero_expert_result is not None:
1933
+ final_hidden_states += zero_expert_result
1934
+
1935
+ if not skip_result_store:
1936
+ if self.shared_experts is None:
1937
+ full_fused_final_hidden_states[
1938
+ chunk_start:chunk_end, :].copy_(final_hidden_states,
1939
+ non_blocking=True)
1940
+ else:
1941
+ full_shared_final_hidden_states[
1942
+ chunk_start:chunk_end, :].copy_(final_hidden_states[0],
1943
+ non_blocking=True)
1944
+ full_fused_final_hidden_states[
1945
+ chunk_start:chunk_end, :].copy_(final_hidden_states[1],
1946
+ non_blocking=True)
1947
+
1948
+ ctx = get_forward_context()
1949
+ # flashinfer_cutlass_kernels can handle: optional DP + TP/EP
1950
+ max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu
1951
+ moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens
1952
+
1953
+ # If the input to the MoE is sequence parallel then divide by sp_size
1954
+ # to find the maximum number of tokens for any individual dispatcher.
1955
+ if self.is_sequence_parallel:
1956
+ max_tokens_across_dispatchers = cdiv(max_tokens_across_dispatchers,
1957
+ self.sp_size)
1958
+
1959
+ num_tokens = full_hidden_states.size(0)
1960
+ for chunk_idx, chunk_start_ in enumerate(
1961
+ range(0, max_tokens_across_dispatchers,
1962
+ moe_dp_chunk_size_per_rank)):
1963
+ chunk_start = chunk_start_
1964
+ chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
1965
+ max_tokens_across_dispatchers)
1966
+ # clamp start and end
1967
+ chunk_start = min(chunk_start, num_tokens - 1)
1968
+ chunk_end = min(chunk_end, num_tokens)
1969
+ with ctx.dp_metadata.chunked_sizes(self.sp_size,
1970
+ moe_dp_chunk_size_per_rank,
1971
+ chunk_idx):
1972
+ process_chunk(chunk_start,
1973
+ chunk_end,
1974
+ skip_result_store=chunk_start_ >= num_tokens)
1975
+
1976
+ if self.shared_experts is None:
1977
+ return full_fused_final_hidden_states
1978
+ else:
1979
+ return (full_shared_final_hidden_states,
1980
+ full_fused_final_hidden_states)
1981
+
1982
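+ # A minimal, self-contained sketch of the chunk-boundary arithmetic used in
+ # forward_impl_chunked above, with made-up sizes: every rank walks the same
+ # number of chunks (derived from the busiest dispatcher) so that the
+ # collective dispatch/combine calls stay matched, and ranks that run out of
+ # local tokens clamp the chunk and skip storing results.
+ def _example_chunk_bounds() -> None:
+     max_tokens_across_dispatchers = 7000
+     moe_dp_chunk_size_per_rank = 2048
+     num_tokens = 3000  # tokens on this rank
+     for chunk_start_ in range(0, max_tokens_across_dispatchers,
+                               moe_dp_chunk_size_per_rank):  # 0, 2048, 4096, 6144
+         chunk_end = min(chunk_start_ + moe_dp_chunk_size_per_rank,
+                         max_tokens_across_dispatchers)
+         chunk_start = min(chunk_start_, num_tokens - 1)   # clamp to local rows
+         chunk_end = min(chunk_end, num_tokens)
+         skip_result_store = chunk_start_ >= num_tokens    # True for 4096, 6144
+         assert 0 <= chunk_start <= chunk_end <= num_tokens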
+ def forward_impl(
1983
+ self,
1984
+ hidden_states: torch.Tensor,
1985
+ router_logits: torch.Tensor,
1986
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
1987
+ assert self.quant_method is not None
1988
+
1989
+ self.ensure_moe_quant_config()
1990
+
1991
+ # Route to the chunked forward path using the FlashInfer Cutlass kernel
1992
+ # only when data parallelism (DP) is enabled.
1993
+ _use_flashinfer_cutlass_kernels = (self.dp_size > 1 and
1994
+ self.use_flashinfer_cutlass_kernels)
1995
+
1996
+ if (self.moe_parallel_config.use_pplx_kernels
1997
+ or self.moe_parallel_config.use_deepep_ll_kernels
1998
+ or _use_flashinfer_cutlass_kernels):
1999
+ return self.forward_impl_chunked(hidden_states, router_logits)
2000
+
2001
+ do_naive_dispatch_combine: bool = (
2002
+ self.dp_size > 1
2003
+ and not self.moe_parallel_config.use_deepep_ht_kernels
2004
+ and not self.moe_config.use_flashinfer_cutlass_kernels)
2005
+
2006
+ # If there are shared experts but we are not using a modular kernel, the
2007
+ # shared experts must be called here
2008
+ if (not isinstance(self.quant_method.fused_experts,
2009
+ FusedMoEModularKernel)
2010
+ and self.shared_experts is not None):
2011
+ shared_output = self.shared_experts(hidden_states)
2012
+ else:
2013
+ shared_output = None
2014
+
2015
+ ctx = get_forward_context()
2016
+ sp_ctx = ctx.dp_metadata.sp_local_sizes(
2017
+ self.sp_size) if ctx.dp_metadata else nullcontext()
2018
+
2019
+ with sp_ctx:
2020
+ if do_naive_dispatch_combine:
2021
+ hidden_states, router_logits = get_ep_group().dispatch(
2022
+ hidden_states, router_logits, self.is_sequence_parallel)
2023
+
2024
+ # Matrix multiply.
2025
+ final_hidden_states = self.quant_method.apply(
2026
+ layer=self,
2027
+ x=hidden_states,
2028
+ router_logits=router_logits,
2029
+ top_k=self.top_k,
2030
+ renormalize=self.renormalize,
2031
+ use_grouped_topk=self.use_grouped_topk,
2032
+ global_num_experts=self.global_num_experts,
2033
+ expert_map=self.expert_map,
2034
+ topk_group=self.topk_group,
2035
+ num_expert_group=self.num_expert_group,
2036
+ custom_routing_function=self.custom_routing_function,
2037
+ scoring_func=self.scoring_func,
2038
+ routed_scaling_factor=self.routed_scaling_factor,
2039
+ e_score_correction_bias=self.e_score_correction_bias,
2040
+ activation=self.activation,
2041
+ apply_router_weight_on_input=self.apply_router_weight_on_input,
2042
+ enable_eplb=self.enable_eplb,
2043
+ expert_load_view=self.expert_load_view,
2044
+ logical_to_physical_map=self.logical_to_physical_map,
2045
+ logical_replica_count=self.logical_replica_count,
2046
+ )
2047
+
2048
+ if shared_output is not None:
2049
+ assert not isinstance(final_hidden_states, tuple)
2050
+ assert self.shared_experts is not None
2051
+ final_hidden_states = (
2052
+ shared_output,
2053
+ final_hidden_states,
2054
+ )
2055
+ elif self.zero_expert_num is not None and self.zero_expert_num > 0:
2056
+ assert isinstance(final_hidden_states, tuple)
2057
+ final_hidden_states, zero_expert_result = final_hidden_states
2058
+
2059
+ def reduce_output(states: torch.Tensor,
2060
+ do_combine: bool = True) -> torch.Tensor:
2061
+ if do_naive_dispatch_combine and do_combine:
2062
+ states = get_ep_group().combine(states,
2063
+ self.is_sequence_parallel)
2064
+
2065
+ if (not self.is_sequence_parallel and self.reduce_results
2066
+ and (self.tp_size > 1 or self.ep_size > 1)):
2067
+ states = self.maybe_all_reduce_tensor_model_parallel(
2068
+ states)
2069
+
2070
+ return states
2071
+
2072
+ if self.shared_experts is not None:
2073
+ return (
2074
+ reduce_output(final_hidden_states[0], do_combine=False),
2075
+ reduce_output(final_hidden_states[1]),
2076
+ )
2077
+ elif self.zero_expert_num is not None and self.zero_expert_num > 0:
2078
+ assert isinstance(final_hidden_states, torch.Tensor)
2079
+ return reduce_output(final_hidden_states) + zero_expert_result
2080
+ else:
2081
+ return reduce_output(final_hidden_states)
2082
+
2083
+ @classmethod
2084
+ def make_expert_params_mapping(
2085
+ cls,
2086
+ ckpt_gate_proj_name: str,
2087
+ ckpt_down_proj_name: str,
2088
+ ckpt_up_proj_name: str,
2089
+ num_experts: int,
2090
+ num_redundant_experts: int = 0) -> list[tuple[str, str, int, str]]:
2091
+
2092
+ num_physical_experts = num_experts + num_redundant_experts
2093
+
2094
+ # In the returned mapping:
2095
+ # - `expert_id` is the physical expert id
2096
+ # - `weight_name` contains the weight name of the logical expert
2097
+ # so the physical `expert_id` must be mapped to its logical id in `weight_name`.
2098
+ physical_to_logical_map = \
2099
+ EplbState.build_initial_global_physical_to_logical_map(
2100
+ num_experts, num_redundant_experts)
2101
+
2102
+ return [
2103
+ # (param_name, weight_name, expert_id, shard_id)
2104
+ ("experts.w13_" if weight_name
2105
+ in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
2106
+ f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.",
2107
+ expert_id, shard_id) for expert_id in range(num_physical_experts)
2108
+ for shard_id, weight_name in [
2109
+ ("w1", ckpt_gate_proj_name),
2110
+ ("w2", ckpt_down_proj_name),
2111
+ ("w3", ckpt_up_proj_name),
2112
+ ]
2113
+ ]
2114
+
2115
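+ # A minimal, self-contained sketch of the mapping produced above, using
+ # checkpoint projection names common to Llama-style MoE checkpoints (the
+ # names and expert count are illustrative):
+ def _example_expert_params_mapping() -> None:
+     mapping = FusedMoE.make_expert_params_mapping(
+         ckpt_gate_proj_name="gate_proj",
+         ckpt_down_proj_name="down_proj",
+         ckpt_up_proj_name="up_proj",
+         num_experts=8,
+     )
+     # With no redundant experts, physical and logical ids coincide, e.g.:
+     #   ("experts.w13_", "experts.0.gate_proj.", 0, "w1")
+     #   ("experts.w2_",  "experts.0.down_proj.", 0, "w2")
+     #   ("experts.w13_", "experts.0.up_proj.",   0, "w3")
+     assert len(mapping) == 8 * 3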
+ def extra_repr(self) -> str:
2116
+
2117
+ s = (
2118
+ f"global_num_experts={self.global_num_experts}, "
2119
+ f"local_num_experts={self.local_num_experts}, "
2120
+ f"top_k={self.top_k}, "
2121
+ f"intermediate_size_per_partition={self.intermediate_size_per_partition}, " # noqa: E501
2122
+ f"tp_size={self.tp_size},\n"
2123
+ f"ep_size={self.ep_size}, "
2124
+ f"reduce_results={self.reduce_results}, "
2125
+ f"renormalize={self.renormalize}, "
2126
+ f"use_grouped_topk={self.use_grouped_topk}")
2127
+
2128
+ if self.use_grouped_topk:
2129
+ s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}" # noqa: E501
2130
+
2131
+ s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'" # noqa: E501
2132
+
2133
+ return s
2134
+
2135
+
2136
+ def moe_forward(
2137
+ hidden_states: torch.Tensor,
2138
+ router_logits: torch.Tensor,
2139
+ layer_name: str,
2140
+ ) -> torch.Tensor:
2141
+ forward_context: ForwardContext = get_forward_context()
2142
+ self = forward_context.no_compile_layers[layer_name]
2143
+ assert self.shared_experts is None
2144
+ return self.forward_impl(hidden_states, router_logits)
2145
+
2146
+
2147
+ def moe_forward_fake(
2148
+ hidden_states: torch.Tensor,
2149
+ router_logits: torch.Tensor,
2150
+ layer_name: str,
2151
+ ) -> torch.Tensor:
2152
+ return torch.empty_like(hidden_states)
2153
+
2154
+
2155
+ direct_register_custom_op(
2156
+ op_name="moe_forward",
2157
+ op_func=moe_forward,
2158
+ mutates_args=["hidden_states"],
2159
+ fake_impl=moe_forward_fake,
2160
+ tags=(torch.Tag.needs_fixed_stride_order, ),
2161
+ )
2162
+
2163
+
2164
+ def moe_forward_shared(
2165
+ hidden_states: torch.Tensor,
2166
+ router_logits: torch.Tensor,
2167
+ layer_name: str,
2168
+ ) -> tuple[torch.Tensor, torch.Tensor]:
2169
+ forward_context: ForwardContext = get_forward_context()
2170
+ self = forward_context.no_compile_layers[layer_name]
2171
+ assert self.shared_experts is not None
2172
+ return self.forward_impl(hidden_states, router_logits)
2173
+
2174
+
2175
+ def moe_forward_shared_fake(
2176
+ hidden_states: torch.Tensor,
2177
+ router_logits: torch.Tensor,
2178
+ layer_name: str,
2179
+ ) -> tuple[torch.Tensor, torch.Tensor]:
2180
+ shared_out = torch.empty_like(hidden_states)
2181
+ fused_out = torch.empty_like(hidden_states)
2182
+ return shared_out, fused_out
2183
+
2184
+
2185
+ direct_register_custom_op(
2186
+ op_name="moe_forward_shared",
2187
+ op_func=moe_forward_shared,
2188
+ mutates_args=["hidden_states"],
2189
+ fake_impl=moe_forward_shared_fake,
2190
+ tags=(torch.Tag.needs_fixed_stride_order, ),
2191
+ )
2192
+
2193
+ # Mark the FusedMoE weight_loader as supporting MoE-specific parameters
2194
+ # to avoid expensive runtime reflection in model loading code
2195
+ FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined]
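+ # A minimal sketch of how loading code can consume this flag instead of
+ # inspecting the weight_loader signature at runtime (the surrounding loader
+ # variables are illustrative, not from this file):
+ #
+ #   weight_loader = param.weight_loader
+ #   if getattr(weight_loader, "supports_moe_loading", False):
+ #       weight_loader(param, loaded_weight, name,
+ #                     shard_id=shard_id, expert_id=expert_id)
+ #   else:
+ #       weight_loader(param, loaded_weight)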