vllm-cpu-avx512vnni 0.10.2.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu-avx512vnni might be problematic.

Files changed (1395)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2022 -0
  5. vllm/_ipex_ops.py +404 -0
  6. vllm/_version.py +34 -0
  7. vllm/adapter_commons/__init__.py +0 -0
  8. vllm/adapter_commons/layers.py +16 -0
  9. vllm/adapter_commons/models.py +106 -0
  10. vllm/adapter_commons/request.py +26 -0
  11. vllm/adapter_commons/utils.py +93 -0
  12. vllm/adapter_commons/worker_manager.py +39 -0
  13. vllm/assets/__init__.py +0 -0
  14. vllm/assets/audio.py +45 -0
  15. vllm/assets/base.py +41 -0
  16. vllm/assets/image.py +50 -0
  17. vllm/assets/video.py +138 -0
  18. vllm/attention/__init__.py +19 -0
  19. vllm/attention/backends/__init__.py +0 -0
  20. vllm/attention/backends/abstract.py +348 -0
  21. vllm/attention/backends/differential_flash_attn.py +935 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1499 -0
  23. vllm/attention/backends/flash_attn.py +933 -0
  24. vllm/attention/backends/flashmla.py +238 -0
  25. vllm/attention/backends/mla/__init__.py +0 -0
  26. vllm/attention/backends/mla/common.py +1310 -0
  27. vllm/attention/backends/placeholder_attn.py +340 -0
  28. vllm/attention/backends/rocm_aiter_mla.py +410 -0
  29. vllm/attention/backends/rocm_flash_attn.py +953 -0
  30. vllm/attention/backends/triton_mla.py +111 -0
  31. vllm/attention/backends/utils.py +610 -0
  32. vllm/attention/backends/xformers.py +805 -0
  33. vllm/attention/layer.py +552 -0
  34. vllm/attention/layers/__init__.py +0 -0
  35. vllm/attention/layers/chunked_local_attention.py +91 -0
  36. vllm/attention/layers/cross_attention.py +159 -0
  37. vllm/attention/layers/encoder_only_attention.py +86 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  40. vllm/attention/ops/common.py +139 -0
  41. vllm/attention/ops/flashmla.py +123 -0
  42. vllm/attention/ops/merge_attn_states.py +43 -0
  43. vllm/attention/ops/paged_attn.py +261 -0
  44. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  45. vllm/attention/ops/prefix_prefill.py +928 -0
  46. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  47. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  48. vllm/attention/ops/triton_decode_attention.py +676 -0
  49. vllm/attention/ops/triton_flash_attention.py +984 -0
  50. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  51. vllm/attention/ops/triton_unified_attention.py +854 -0
  52. vllm/attention/selector.py +243 -0
  53. vllm/attention/utils/__init__.py +0 -0
  54. vllm/attention/utils/fa_utils.py +85 -0
  55. vllm/attention/utils/kv_sharing_utils.py +33 -0
  56. vllm/beam_search.py +87 -0
  57. vllm/benchmarks/__init__.py +0 -0
  58. vllm/benchmarks/datasets.py +2651 -0
  59. vllm/benchmarks/latency.py +170 -0
  60. vllm/benchmarks/lib/__init__.py +3 -0
  61. vllm/benchmarks/lib/endpoint_request_func.py +510 -0
  62. vllm/benchmarks/lib/ready_checker.py +72 -0
  63. vllm/benchmarks/lib/utils.py +80 -0
  64. vllm/benchmarks/serve.py +1247 -0
  65. vllm/benchmarks/throughput.py +696 -0
  66. vllm/collect_env.py +823 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/activation_quant_fusion.py +193 -0
  69. vllm/compilation/backends.py +641 -0
  70. vllm/compilation/base_static_graph.py +51 -0
  71. vllm/compilation/collective_fusion.py +1190 -0
  72. vllm/compilation/compiler_interface.py +572 -0
  73. vllm/compilation/counter.py +47 -0
  74. vllm/compilation/cuda_graph.py +193 -0
  75. vllm/compilation/cuda_piecewise_backend.py +117 -0
  76. vllm/compilation/decorators.py +316 -0
  77. vllm/compilation/fix_functionalization.py +208 -0
  78. vllm/compilation/fusion.py +600 -0
  79. vllm/compilation/fusion_attn.py +303 -0
  80. vllm/compilation/fx_utils.py +84 -0
  81. vllm/compilation/inductor_pass.py +136 -0
  82. vllm/compilation/monitor.py +57 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +165 -0
  85. vllm/compilation/pass_manager.py +88 -0
  86. vllm/compilation/sequence_parallelism.py +484 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +50 -0
  89. vllm/compilation/wrapper.py +138 -0
  90. vllm/config/__init__.py +3921 -0
  91. vllm/config/cache.py +214 -0
  92. vllm/config/compilation.py +580 -0
  93. vllm/config/kv_events.py +50 -0
  94. vllm/config/kv_transfer.py +111 -0
  95. vllm/config/load.py +113 -0
  96. vllm/config/lora.py +132 -0
  97. vllm/config/parallel.py +446 -0
  98. vllm/config/scheduler.py +304 -0
  99. vllm/config/utils.py +29 -0
  100. vllm/connections.py +174 -0
  101. vllm/core/__init__.py +0 -0
  102. vllm/core/block/__init__.py +0 -0
  103. vllm/core/block/block_table.py +399 -0
  104. vllm/core/block/common.py +371 -0
  105. vllm/core/block/cpu_gpu_block_allocator.py +439 -0
  106. vllm/core/block/interfaces.py +319 -0
  107. vllm/core/block/naive_block.py +466 -0
  108. vllm/core/block/prefix_caching_block.py +1135 -0
  109. vllm/core/block/utils.py +28 -0
  110. vllm/core/block_manager.py +523 -0
  111. vllm/core/evictor.py +157 -0
  112. vllm/core/interfaces.py +139 -0
  113. vllm/core/placeholder_block_space_manager.py +103 -0
  114. vllm/core/scheduler.py +2028 -0
  115. vllm/device_allocator/__init__.py +0 -0
  116. vllm/device_allocator/cumem.py +286 -0
  117. vllm/distributed/__init__.py +6 -0
  118. vllm/distributed/communication_op.py +41 -0
  119. vllm/distributed/device_communicators/__init__.py +0 -0
  120. vllm/distributed/device_communicators/all2all.py +259 -0
  121. vllm/distributed/device_communicators/all_reduce_utils.py +292 -0
  122. vllm/distributed/device_communicators/base_device_communicator.py +277 -0
  123. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  124. vllm/distributed/device_communicators/cuda_communicator.py +294 -0
  125. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  126. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  127. vllm/distributed/device_communicators/pynccl.py +290 -0
  128. vllm/distributed/device_communicators/pynccl_wrapper.py +382 -0
  129. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  130. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  131. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  132. vllm/distributed/device_communicators/symm_mem.py +136 -0
  133. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  134. vllm/distributed/device_communicators/xpu_communicator.py +69 -0
  135. vllm/distributed/eplb/__init__.py +8 -0
  136. vllm/distributed/eplb/eplb_state.py +619 -0
  137. vllm/distributed/eplb/rebalance_algo.py +234 -0
  138. vllm/distributed/eplb/rebalance_execute.py +424 -0
  139. vllm/distributed/kv_events.py +362 -0
  140. vllm/distributed/kv_transfer/README.md +29 -0
  141. vllm/distributed/kv_transfer/__init__.py +13 -0
  142. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  143. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  145. vllm/distributed/kv_transfer/kv_connector/factory.py +108 -0
  146. vllm/distributed/kv_transfer/kv_connector/utils.py +246 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/base.py +356 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +266 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1319 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +484 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +542 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +266 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +414 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  158. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  159. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  160. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  161. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  162. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  163. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  164. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  165. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  166. vllm/distributed/parallel_state.py +1489 -0
  167. vllm/distributed/tpu_distributed_utils.py +178 -0
  168. vllm/distributed/utils.py +536 -0
  169. vllm/engine/__init__.py +0 -0
  170. vllm/engine/arg_utils.py +1857 -0
  171. vllm/engine/async_llm_engine.py +1044 -0
  172. vllm/engine/async_timeout.py +173 -0
  173. vllm/engine/llm_engine.py +1849 -0
  174. vllm/engine/metrics.py +577 -0
  175. vllm/engine/metrics_types.py +84 -0
  176. vllm/engine/multiprocessing/__init__.py +145 -0
  177. vllm/engine/multiprocessing/client.py +643 -0
  178. vllm/engine/multiprocessing/engine.py +470 -0
  179. vllm/engine/output_processor/__init__.py +0 -0
  180. vllm/engine/output_processor/interfaces.py +61 -0
  181. vllm/engine/output_processor/single_step.py +145 -0
  182. vllm/engine/output_processor/stop_checker.py +131 -0
  183. vllm/engine/output_processor/util.py +28 -0
  184. vllm/engine/protocol.py +343 -0
  185. vllm/entrypoints/__init__.py +0 -0
  186. vllm/entrypoints/api_server.py +178 -0
  187. vllm/entrypoints/chat_utils.py +1535 -0
  188. vllm/entrypoints/cli/__init__.py +12 -0
  189. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  190. vllm/entrypoints/cli/benchmark/base.py +25 -0
  191. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  192. vllm/entrypoints/cli/benchmark/main.py +58 -0
  193. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  194. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  195. vllm/entrypoints/cli/collect_env.py +36 -0
  196. vllm/entrypoints/cli/main.py +60 -0
  197. vllm/entrypoints/cli/openai.py +214 -0
  198. vllm/entrypoints/cli/run_batch.py +69 -0
  199. vllm/entrypoints/cli/serve.py +232 -0
  200. vllm/entrypoints/cli/types.py +29 -0
  201. vllm/entrypoints/constants.py +10 -0
  202. vllm/entrypoints/context.py +444 -0
  203. vllm/entrypoints/harmony_utils.py +431 -0
  204. vllm/entrypoints/launcher.py +168 -0
  205. vllm/entrypoints/llm.py +1579 -0
  206. vllm/entrypoints/logger.py +79 -0
  207. vllm/entrypoints/openai/__init__.py +0 -0
  208. vllm/entrypoints/openai/api_server.py +2011 -0
  209. vllm/entrypoints/openai/cli_args.py +281 -0
  210. vllm/entrypoints/openai/logits_processors.py +90 -0
  211. vllm/entrypoints/openai/protocol.py +2590 -0
  212. vllm/entrypoints/openai/run_batch.py +497 -0
  213. vllm/entrypoints/openai/serving_chat.py +1591 -0
  214. vllm/entrypoints/openai/serving_classification.py +176 -0
  215. vllm/entrypoints/openai/serving_completion.py +688 -0
  216. vllm/entrypoints/openai/serving_embedding.py +632 -0
  217. vllm/entrypoints/openai/serving_engine.py +996 -0
  218. vllm/entrypoints/openai/serving_models.py +288 -0
  219. vllm/entrypoints/openai/serving_pooling.py +277 -0
  220. vllm/entrypoints/openai/serving_responses.py +1690 -0
  221. vllm/entrypoints/openai/serving_score.py +479 -0
  222. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  223. vllm/entrypoints/openai/serving_transcription.py +136 -0
  224. vllm/entrypoints/openai/speech_to_text.py +388 -0
  225. vllm/entrypoints/openai/tool_parsers/__init__.py +51 -0
  226. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  227. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  228. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  229. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  230. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  231. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  232. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +418 -0
  233. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  234. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  235. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  236. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  237. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  238. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  239. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  240. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  241. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +73 -0
  242. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  243. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  244. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  245. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  246. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  247. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  248. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  249. vllm/entrypoints/renderer.py +395 -0
  250. vllm/entrypoints/score_utils.py +232 -0
  251. vllm/entrypoints/ssl.py +75 -0
  252. vllm/entrypoints/tool.py +139 -0
  253. vllm/entrypoints/tool_server.py +195 -0
  254. vllm/entrypoints/utils.py +328 -0
  255. vllm/env_override.py +23 -0
  256. vllm/envs.py +1354 -0
  257. vllm/executor/__init__.py +0 -0
  258. vllm/executor/executor_base.py +378 -0
  259. vllm/executor/mp_distributed_executor.py +244 -0
  260. vllm/executor/msgspec_utils.py +35 -0
  261. vllm/executor/multiproc_worker_utils.py +279 -0
  262. vllm/executor/ray_distributed_executor.py +699 -0
  263. vllm/executor/ray_utils.py +410 -0
  264. vllm/executor/uniproc_executor.py +152 -0
  265. vllm/forward_context.py +273 -0
  266. vllm/inputs/__init__.py +44 -0
  267. vllm/inputs/data.py +356 -0
  268. vllm/inputs/parse.py +151 -0
  269. vllm/inputs/preprocess.py +973 -0
  270. vllm/inputs/registry.py +251 -0
  271. vllm/logger.py +229 -0
  272. vllm/logging_utils/__init__.py +8 -0
  273. vllm/logging_utils/dump_input.py +81 -0
  274. vllm/logging_utils/formatter.py +79 -0
  275. vllm/logits_process.py +119 -0
  276. vllm/logprobs.py +28 -0
  277. vllm/lora/__init__.py +0 -0
  278. vllm/lora/layers/__init__.py +34 -0
  279. vllm/lora/layers/base.py +69 -0
  280. vllm/lora/layers/base_linear.py +184 -0
  281. vllm/lora/layers/column_parallel_linear.py +622 -0
  282. vllm/lora/layers/logits_processor.py +247 -0
  283. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  284. vllm/lora/layers/replicated_linear.py +61 -0
  285. vllm/lora/layers/row_parallel_linear.py +201 -0
  286. vllm/lora/layers/utils.py +60 -0
  287. vllm/lora/layers/vocal_parallel_embedding.py +172 -0
  288. vllm/lora/lora.py +199 -0
  289. vllm/lora/models.py +792 -0
  290. vllm/lora/ops/__init__.py +0 -0
  291. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  292. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  293. vllm/lora/ops/torch_ops/__init__.py +16 -0
  294. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  295. vllm/lora/ops/triton_ops/__init__.py +12 -0
  296. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  297. vllm/lora/ops/triton_ops/lora_expand_op.py +291 -0
  298. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  299. vllm/lora/ops/triton_ops/lora_shrink_op.py +245 -0
  300. vllm/lora/ops/triton_ops/utils.py +126 -0
  301. vllm/lora/ops/xla_ops/__init__.py +7 -0
  302. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  303. vllm/lora/peft_helper.py +127 -0
  304. vllm/lora/punica_wrapper/__init__.py +10 -0
  305. vllm/lora/punica_wrapper/punica_base.py +458 -0
  306. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  307. vllm/lora/punica_wrapper/punica_gpu.py +279 -0
  308. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  309. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  310. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  311. vllm/lora/punica_wrapper/utils.py +136 -0
  312. vllm/lora/request.py +99 -0
  313. vllm/lora/resolver.py +85 -0
  314. vllm/lora/utils.py +246 -0
  315. vllm/lora/worker_manager.py +256 -0
  316. vllm/model_executor/__init__.py +16 -0
  317. vllm/model_executor/custom_op.py +194 -0
  318. vllm/model_executor/layers/__init__.py +0 -0
  319. vllm/model_executor/layers/activation.py +575 -0
  320. vllm/model_executor/layers/attention_layer_base.py +23 -0
  321. vllm/model_executor/layers/fla/__init__.py +8 -0
  322. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  323. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  324. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  325. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  326. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  327. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  328. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  329. vllm/model_executor/layers/fla/ops/index.py +39 -0
  330. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  331. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  332. vllm/model_executor/layers/fla/ops/op.py +39 -0
  333. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  334. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  335. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  336. vllm/model_executor/layers/fused_moe/__init__.py +80 -0
  337. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +304 -0
  338. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +164 -0
  339. vllm/model_executor/layers/fused_moe/config.py +497 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  560. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +297 -0
  561. vllm/model_executor/layers/fused_moe/cutlass_moe.py +996 -0
  562. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +370 -0
  563. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  564. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +280 -0
  565. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +229 -0
  566. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +243 -0
  567. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +97 -0
  568. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1042 -0
  569. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +240 -0
  570. vllm/model_executor/layers/fused_moe/fused_moe.py +2081 -0
  571. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +247 -0
  572. vllm/model_executor/layers/fused_moe/layer.py +1951 -0
  573. vllm/model_executor/layers/fused_moe/modular_kernel.py +892 -0
  574. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  575. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  576. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  577. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  578. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +321 -0
  579. vllm/model_executor/layers/fused_moe/prepare_finalize.py +72 -0
  580. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +431 -0
  581. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  582. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  583. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +171 -0
  584. vllm/model_executor/layers/fused_moe/trtllm_moe.py +197 -0
  585. vllm/model_executor/layers/fused_moe/utils.py +270 -0
  586. vllm/model_executor/layers/layernorm.py +381 -0
  587. vllm/model_executor/layers/lightning_attn.py +661 -0
  588. vllm/model_executor/layers/linear.py +1567 -0
  589. vllm/model_executor/layers/logits_processor.py +199 -0
  590. vllm/model_executor/layers/mamba/__init__.py +0 -0
  591. vllm/model_executor/layers/mamba/abstract.py +45 -0
  592. vllm/model_executor/layers/mamba/linear_attn.py +432 -0
  593. vllm/model_executor/layers/mamba/mamba2_metadata.py +186 -0
  594. vllm/model_executor/layers/mamba/mamba_mixer.py +517 -0
  595. vllm/model_executor/layers/mamba/mamba_mixer2.py +803 -0
  596. vllm/model_executor/layers/mamba/mamba_utils.py +202 -0
  597. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  598. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +982 -0
  599. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  600. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  601. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  602. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +574 -0
  603. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  604. vllm/model_executor/layers/mamba/ops/ssd_combined.py +248 -0
  605. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +248 -0
  606. vllm/model_executor/layers/mamba/short_conv.py +270 -0
  607. vllm/model_executor/layers/mla.py +158 -0
  608. vllm/model_executor/layers/pooler.py +732 -0
  609. vllm/model_executor/layers/quantization/__init__.py +157 -0
  610. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  611. vllm/model_executor/layers/quantization/awq.py +228 -0
  612. vllm/model_executor/layers/quantization/awq_marlin.py +548 -0
  613. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  614. vllm/model_executor/layers/quantization/base_config.py +164 -0
  615. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  616. vllm/model_executor/layers/quantization/bitsandbytes.py +621 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +795 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1651 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  625. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +161 -0
  626. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  627. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  628. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  629. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +156 -0
  630. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  631. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  632. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +227 -0
  633. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +135 -0
  634. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +21 -0
  635. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  636. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  637. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  638. vllm/model_executor/layers/quantization/deepgemm.py +81 -0
  639. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  640. vllm/model_executor/layers/quantization/experts_int8.py +215 -0
  641. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  642. vllm/model_executor/layers/quantization/fp8.py +1179 -0
  643. vllm/model_executor/layers/quantization/gguf.py +597 -0
  644. vllm/model_executor/layers/quantization/gptq.py +300 -0
  645. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  646. vllm/model_executor/layers/quantization/gptq_marlin.py +700 -0
  647. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  648. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  649. vllm/model_executor/layers/quantization/inc.py +61 -0
  650. vllm/model_executor/layers/quantization/input_quant_fp8.py +103 -0
  651. vllm/model_executor/layers/quantization/ipex_quant.py +410 -0
  652. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  653. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  654. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  655. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  656. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  657. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  658. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  659. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  660. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  661. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  662. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  663. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  664. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  665. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +163 -0
  666. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  667. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  668. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  669. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  670. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  671. vllm/model_executor/layers/quantization/modelopt.py +1548 -0
  672. vllm/model_executor/layers/quantization/moe_wna16.py +473 -0
  673. vllm/model_executor/layers/quantization/mxfp4.py +951 -0
  674. vllm/model_executor/layers/quantization/petit.py +306 -0
  675. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  676. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  677. vllm/model_executor/layers/quantization/quark/quark.py +431 -0
  678. vllm/model_executor/layers/quantization/quark/quark_moe.py +434 -0
  679. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  680. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  681. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +112 -0
  682. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  683. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  684. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  685. vllm/model_executor/layers/quantization/rtn.py +456 -0
  686. vllm/model_executor/layers/quantization/schema.py +86 -0
  687. vllm/model_executor/layers/quantization/torchao.py +214 -0
  688. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  689. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  690. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  691. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  903. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +85 -0
  904. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +258 -0
  905. vllm/model_executor/layers/quantization/utils/fp8_utils.py +795 -0
  906. vllm/model_executor/layers/quantization/utils/gptq_utils.py +96 -0
  907. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  908. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  909. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  910. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  911. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  912. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  913. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  914. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  915. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +132 -0
  916. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  917. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  918. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  919. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  920. vllm/model_executor/layers/quantization/utils/quant_utils.py +627 -0
  921. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  922. vllm/model_executor/layers/resampler.py +270 -0
  923. vllm/model_executor/layers/rotary_embedding/__init__.py +190 -0
  924. vllm/model_executor/layers/rotary_embedding/base.py +156 -0
  925. vllm/model_executor/layers/rotary_embedding/common.py +105 -0
  926. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +140 -0
  927. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  928. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  929. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  930. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  931. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  932. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  933. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  934. vllm/model_executor/layers/rotary_embedding/mrope.py +1140 -0
  935. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  936. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  937. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  938. vllm/model_executor/layers/sampler.py +1198 -0
  939. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  940. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  941. vllm/model_executor/layers/utils.py +196 -0
  942. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  943. vllm/model_executor/model_loader/__init__.py +138 -0
  944. vllm/model_executor/model_loader/base_loader.py +52 -0
  945. vllm/model_executor/model_loader/bitsandbytes_loader.py +787 -0
  946. vllm/model_executor/model_loader/default_loader.py +278 -0
  947. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  948. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  949. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  950. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  951. vllm/model_executor/model_loader/tensorizer.py +743 -0
  952. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  953. vllm/model_executor/model_loader/tpu.py +114 -0
  954. vllm/model_executor/model_loader/utils.py +271 -0
  955. vllm/model_executor/model_loader/weight_utils.py +946 -0
  956. vllm/model_executor/models/__init__.py +30 -0
  957. vllm/model_executor/models/adapters.py +542 -0
  958. vllm/model_executor/models/aimv2.py +246 -0
  959. vllm/model_executor/models/apertus.py +582 -0
  960. vllm/model_executor/models/arcee.py +423 -0
  961. vllm/model_executor/models/arctic.py +560 -0
  962. vllm/model_executor/models/aria.py +662 -0
  963. vllm/model_executor/models/aya_vision.py +470 -0
  964. vllm/model_executor/models/baichuan.py +475 -0
  965. vllm/model_executor/models/bailing_moe.py +529 -0
  966. vllm/model_executor/models/bamba.py +582 -0
  967. vllm/model_executor/models/bart.py +1343 -0
  968. vllm/model_executor/models/bert.py +613 -0
  969. vllm/model_executor/models/bert_with_rope.py +687 -0
  970. vllm/model_executor/models/blip.py +339 -0
  971. vllm/model_executor/models/blip2.py +716 -0
  972. vllm/model_executor/models/bloom.py +374 -0
  973. vllm/model_executor/models/chameleon.py +1141 -0
  974. vllm/model_executor/models/chatglm.py +479 -0
  975. vllm/model_executor/models/clip.py +407 -0
  976. vllm/model_executor/models/cohere2_vision.py +484 -0
  977. vllm/model_executor/models/commandr.py +467 -0
  978. vllm/model_executor/models/config.py +434 -0
  979. vllm/model_executor/models/constant_size_cache.py +137 -0
  980. vllm/model_executor/models/dbrx.py +473 -0
  981. vllm/model_executor/models/deepseek.py +491 -0
  982. vllm/model_executor/models/deepseek_eagle.py +241 -0
  983. vllm/model_executor/models/deepseek_mtp.py +282 -0
  984. vllm/model_executor/models/deepseek_v2.py +1058 -0
  985. vllm/model_executor/models/deepseek_vl2.py +661 -0
  986. vllm/model_executor/models/donut.py +387 -0
  987. vllm/model_executor/models/dots1.py +547 -0
  988. vllm/model_executor/models/ernie45.py +43 -0
  989. vllm/model_executor/models/ernie45_moe.py +608 -0
  990. vllm/model_executor/models/ernie45_vl.py +1510 -0
  991. vllm/model_executor/models/ernie45_vl_moe.py +728 -0
  992. vllm/model_executor/models/ernie_mtp.py +287 -0
  993. vllm/model_executor/models/exaone.py +552 -0
  994. vllm/model_executor/models/exaone4.py +535 -0
  995. vllm/model_executor/models/fairseq2_llama.py +154 -0
  996. vllm/model_executor/models/falcon.py +511 -0
  997. vllm/model_executor/models/falcon_h1.py +739 -0
  998. vllm/model_executor/models/florence2.py +1107 -0
  999. vllm/model_executor/models/fuyu.py +401 -0
  1000. vllm/model_executor/models/gemma.py +428 -0
  1001. vllm/model_executor/models/gemma2.py +425 -0
  1002. vllm/model_executor/models/gemma3.py +542 -0
  1003. vllm/model_executor/models/gemma3_mm.py +723 -0
  1004. vllm/model_executor/models/gemma3n.py +830 -0
  1005. vllm/model_executor/models/gemma3n_mm.py +767 -0
  1006. vllm/model_executor/models/glm.py +23 -0
  1007. vllm/model_executor/models/glm4.py +305 -0
  1008. vllm/model_executor/models/glm4_1v.py +1669 -0
  1009. vllm/model_executor/models/glm4_moe.py +703 -0
  1010. vllm/model_executor/models/glm4_moe_mtp.py +306 -0
  1011. vllm/model_executor/models/glm4v.py +654 -0
  1012. vllm/model_executor/models/gpt2.py +383 -0
  1013. vllm/model_executor/models/gpt_bigcode.py +346 -0
  1014. vllm/model_executor/models/gpt_j.py +340 -0
  1015. vllm/model_executor/models/gpt_neox.py +333 -0
  1016. vllm/model_executor/models/gpt_oss.py +687 -0
  1017. vllm/model_executor/models/granite.py +498 -0
  1018. vllm/model_executor/models/granite_speech.py +799 -0
  1019. vllm/model_executor/models/granitemoe.py +541 -0
  1020. vllm/model_executor/models/granitemoehybrid.py +684 -0
  1021. vllm/model_executor/models/granitemoeshared.py +342 -0
  1022. vllm/model_executor/models/gritlm.py +262 -0
  1023. vllm/model_executor/models/grok1.py +550 -0
  1024. vllm/model_executor/models/h2ovl.py +536 -0
  1025. vllm/model_executor/models/hunyuan_v1.py +937 -0
  1026. vllm/model_executor/models/hyperclovax_vision.py +1206 -0
  1027. vllm/model_executor/models/idefics2_vision_model.py +416 -0
  1028. vllm/model_executor/models/idefics3.py +758 -0
  1029. vllm/model_executor/models/interfaces.py +854 -0
  1030. vllm/model_executor/models/interfaces_base.py +195 -0
  1031. vllm/model_executor/models/intern_vit.py +481 -0
  1032. vllm/model_executor/models/internlm2.py +453 -0
  1033. vllm/model_executor/models/internlm2_ve.py +148 -0
  1034. vllm/model_executor/models/interns1.py +832 -0
  1035. vllm/model_executor/models/interns1_vit.py +418 -0
  1036. vllm/model_executor/models/internvl.py +1423 -0
  1037. vllm/model_executor/models/jais.py +374 -0
  1038. vllm/model_executor/models/jamba.py +630 -0
  1039. vllm/model_executor/models/jina_vl.py +144 -0
  1040. vllm/model_executor/models/keye.py +1684 -0
  1041. vllm/model_executor/models/keye_vl1_5.py +601 -0
  1042. vllm/model_executor/models/kimi_vl.py +620 -0
  1043. vllm/model_executor/models/lfm2.py +558 -0
  1044. vllm/model_executor/models/llama.py +671 -0
  1045. vllm/model_executor/models/llama4.py +732 -0
  1046. vllm/model_executor/models/llama4_eagle.py +241 -0
  1047. vllm/model_executor/models/llama_eagle.py +171 -0
  1048. vllm/model_executor/models/llama_eagle3.py +292 -0
  1049. vllm/model_executor/models/llava.py +872 -0
  1050. vllm/model_executor/models/llava_next.py +572 -0
  1051. vllm/model_executor/models/llava_next_video.py +479 -0
  1052. vllm/model_executor/models/llava_onevision.py +945 -0
  1053. vllm/model_executor/models/mamba.py +310 -0
  1054. vllm/model_executor/models/mamba2.py +346 -0
  1055. vllm/model_executor/models/mamba_cache.py +83 -0
  1056. vllm/model_executor/models/medusa.py +219 -0
  1057. vllm/model_executor/models/midashenglm.py +788 -0
  1058. vllm/model_executor/models/mimo.py +191 -0
  1059. vllm/model_executor/models/mimo_mtp.py +273 -0
  1060. vllm/model_executor/models/minicpm.py +593 -0
  1061. vllm/model_executor/models/minicpm3.py +230 -0
  1062. vllm/model_executor/models/minicpm_eagle.py +391 -0
  1063. vllm/model_executor/models/minicpmo.py +804 -0
  1064. vllm/model_executor/models/minicpmv.py +1786 -0
  1065. vllm/model_executor/models/minimax_cache.py +36 -0
  1066. vllm/model_executor/models/minimax_text_01.py +1027 -0
  1067. vllm/model_executor/models/minimax_vl_01.py +431 -0
  1068. vllm/model_executor/models/mistral3.py +628 -0
  1069. vllm/model_executor/models/mixtral.py +494 -0
  1070. vllm/model_executor/models/mllama.py +1697 -0
  1071. vllm/model_executor/models/mllama4.py +1079 -0
  1072. vllm/model_executor/models/mlp_speculator.py +206 -0
  1073. vllm/model_executor/models/modernbert.py +374 -0
  1074. vllm/model_executor/models/module_mapping.py +72 -0
  1075. vllm/model_executor/models/molmo.py +1569 -0
  1076. vllm/model_executor/models/moonvit.py +663 -0
  1077. vllm/model_executor/models/motif.py +345 -0
  1078. vllm/model_executor/models/mpt.py +332 -0
  1079. vllm/model_executor/models/nano_nemotron_vl.py +1395 -0
  1080. vllm/model_executor/models/nemotron.py +509 -0
  1081. vllm/model_executor/models/nemotron_h.py +633 -0
  1082. vllm/model_executor/models/nemotron_nas.py +484 -0
  1083. vllm/model_executor/models/nemotron_vl.py +655 -0
  1084. vllm/model_executor/models/nvlm_d.py +203 -0
  1085. vllm/model_executor/models/olmo.py +406 -0
  1086. vllm/model_executor/models/olmo2.py +428 -0
  1087. vllm/model_executor/models/olmoe.py +485 -0
  1088. vllm/model_executor/models/opt.py +413 -0
  1089. vllm/model_executor/models/orion.py +350 -0
  1090. vllm/model_executor/models/ovis.py +572 -0
  1091. vllm/model_executor/models/ovis2_5.py +644 -0
  1092. vllm/model_executor/models/paligemma.py +414 -0
  1093. vllm/model_executor/models/persimmon.py +345 -0
  1094. vllm/model_executor/models/phi.py +357 -0
  1095. vllm/model_executor/models/phi3.py +19 -0
  1096. vllm/model_executor/models/phi3v.py +701 -0
  1097. vllm/model_executor/models/phi4_multimodal.py +1478 -0
  1098. vllm/model_executor/models/phi4flash.py +737 -0
  1099. vllm/model_executor/models/phi4mm.py +1281 -0
  1100. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1101. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1102. vllm/model_executor/models/phimoe.py +681 -0
  1103. vllm/model_executor/models/pixtral.py +1348 -0
  1104. vllm/model_executor/models/plamo2.py +1126 -0
  1105. vllm/model_executor/models/qwen.py +363 -0
  1106. vllm/model_executor/models/qwen2.py +526 -0
  1107. vllm/model_executor/models/qwen2_5_omni_thinker.py +985 -0
  1108. vllm/model_executor/models/qwen2_5_vl.py +1256 -0
  1109. vllm/model_executor/models/qwen2_audio.py +492 -0
  1110. vllm/model_executor/models/qwen2_moe.py +558 -0
  1111. vllm/model_executor/models/qwen2_rm.py +122 -0
  1112. vllm/model_executor/models/qwen2_vl.py +1512 -0
  1113. vllm/model_executor/models/qwen3.py +344 -0
  1114. vllm/model_executor/models/qwen3_moe.py +704 -0
  1115. vllm/model_executor/models/qwen3_next.py +1298 -0
  1116. vllm/model_executor/models/qwen3_next_mtp.py +285 -0
  1117. vllm/model_executor/models/qwen_vl.py +795 -0
  1118. vllm/model_executor/models/registry.py +891 -0
  1119. vllm/model_executor/models/roberta.py +252 -0
  1120. vllm/model_executor/models/rvl.py +103 -0
  1121. vllm/model_executor/models/seed_oss.py +488 -0
  1122. vllm/model_executor/models/siglip.py +524 -0
  1123. vllm/model_executor/models/siglip2navit.py +688 -0
  1124. vllm/model_executor/models/skyworkr1v.py +914 -0
  1125. vllm/model_executor/models/smolvlm.py +44 -0
  1126. vllm/model_executor/models/solar.py +506 -0
  1127. vllm/model_executor/models/stablelm.py +344 -0
  1128. vllm/model_executor/models/starcoder2.py +357 -0
  1129. vllm/model_executor/models/step3_text.py +521 -0
  1130. vllm/model_executor/models/step3_vl.py +1091 -0
  1131. vllm/model_executor/models/swin.py +475 -0
  1132. vllm/model_executor/models/tarsier.py +649 -0
  1133. vllm/model_executor/models/telechat2.py +151 -0
  1134. vllm/model_executor/models/teleflm.py +79 -0
  1135. vllm/model_executor/models/terratorch.py +294 -0
  1136. vllm/model_executor/models/transformers.py +883 -0
  1137. vllm/model_executor/models/ultravox.py +667 -0
  1138. vllm/model_executor/models/utils.py +770 -0
  1139. vllm/model_executor/models/vision.py +125 -0
  1140. vllm/model_executor/models/voxtral.py +789 -0
  1141. vllm/model_executor/models/whisper.py +966 -0
  1142. vllm/model_executor/models/zamba2.py +1056 -0
  1143. vllm/model_executor/parameter.py +599 -0
  1144. vllm/model_executor/sampling_metadata.py +597 -0
  1145. vllm/model_executor/utils.py +97 -0
  1146. vllm/model_executor/warmup/__init__.py +0 -0
  1147. vllm/model_executor/warmup/deep_gemm_warmup.py +223 -0
  1148. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1149. vllm/multimodal/__init__.py +35 -0
  1150. vllm/multimodal/audio.py +116 -0
  1151. vllm/multimodal/base.py +219 -0
  1152. vllm/multimodal/cache.py +507 -0
  1153. vllm/multimodal/hasher.py +110 -0
  1154. vllm/multimodal/image.py +130 -0
  1155. vllm/multimodal/inputs.py +979 -0
  1156. vllm/multimodal/parse.py +496 -0
  1157. vllm/multimodal/processing.py +1921 -0
  1158. vllm/multimodal/profiling.py +313 -0
  1159. vllm/multimodal/registry.py +375 -0
  1160. vllm/multimodal/utils.py +754 -0
  1161. vllm/multimodal/video.py +312 -0
  1162. vllm/outputs.py +517 -0
  1163. vllm/platforms/__init__.py +263 -0
  1164. vllm/platforms/cpu.py +353 -0
  1165. vllm/platforms/cuda.py +731 -0
  1166. vllm/platforms/interface.py +599 -0
  1167. vllm/platforms/rocm.py +504 -0
  1168. vllm/platforms/tpu.py +236 -0
  1169. vllm/platforms/xpu.py +243 -0
  1170. vllm/plugins/__init__.py +72 -0
  1171. vllm/plugins/io_processors/__init__.py +68 -0
  1172. vllm/plugins/io_processors/interface.py +67 -0
  1173. vllm/plugins/lora_resolvers/README.md +16 -0
  1174. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1175. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1176. vllm/pooling_params.py +183 -0
  1177. vllm/profiler/__init__.py +0 -0
  1178. vllm/profiler/layerwise_profile.py +375 -0
  1179. vllm/profiler/utils.py +148 -0
  1180. vllm/py.typed +2 -0
  1181. vllm/ray/__init__.py +0 -0
  1182. vllm/ray/lazy_utils.py +22 -0
  1183. vllm/ray/ray_env.py +72 -0
  1184. vllm/reasoning/__init__.py +25 -0
  1185. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1186. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1187. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1188. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1189. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1190. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1191. vllm/reasoning/mistral_reasoning_parser.py +47 -0
  1192. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1193. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1194. vllm/sampling_params.py +577 -0
  1195. vllm/scalar_type.py +349 -0
  1196. vllm/scripts.py +15 -0
  1197. vllm/sequence.py +1465 -0
  1198. vllm/tasks.py +11 -0
  1199. vllm/test_utils.py +130 -0
  1200. vllm/third_party/__init__.py +0 -0
  1201. vllm/third_party/pynvml.py +6140 -0
  1202. vllm/tracing.py +136 -0
  1203. vllm/transformers_utils/__init__.py +24 -0
  1204. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1205. vllm/transformers_utils/chat_templates/registry.py +71 -0
  1206. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1207. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1208. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1209. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1210. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1211. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1212. vllm/transformers_utils/config.py +1043 -0
  1213. vllm/transformers_utils/config_parser_base.py +20 -0
  1214. vllm/transformers_utils/configs/__init__.py +55 -0
  1215. vllm/transformers_utils/configs/arctic.py +207 -0
  1216. vllm/transformers_utils/configs/chatglm.py +72 -0
  1217. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1218. vllm/transformers_utils/configs/eagle.py +84 -0
  1219. vllm/transformers_utils/configs/falcon.py +90 -0
  1220. vllm/transformers_utils/configs/jais.py +238 -0
  1221. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1222. vllm/transformers_utils/configs/medusa.py +63 -0
  1223. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1224. vllm/transformers_utils/configs/mistral.py +165 -0
  1225. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1226. vllm/transformers_utils/configs/moonvit.py +33 -0
  1227. vllm/transformers_utils/configs/nemotron.py +205 -0
  1228. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1229. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1230. vllm/transformers_utils/configs/ovis.py +176 -0
  1231. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1232. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1233. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1234. vllm/transformers_utils/configs/speculators/base.py +91 -0
  1235. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1236. vllm/transformers_utils/configs/ultravox.py +120 -0
  1237. vllm/transformers_utils/detokenizer.py +169 -0
  1238. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1239. vllm/transformers_utils/dynamic_module.py +60 -0
  1240. vllm/transformers_utils/processor.py +245 -0
  1241. vllm/transformers_utils/processors/__init__.py +16 -0
  1242. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1243. vllm/transformers_utils/processors/ovis.py +420 -0
  1244. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1245. vllm/transformers_utils/runai_utils.py +99 -0
  1246. vllm/transformers_utils/s3_utils.py +90 -0
  1247. vllm/transformers_utils/tokenizer.py +293 -0
  1248. vllm/transformers_utils/tokenizer_base.py +149 -0
  1249. vllm/transformers_utils/tokenizer_group.py +132 -0
  1250. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1251. vllm/transformers_utils/tokenizers/mistral.py +520 -0
  1252. vllm/transformers_utils/utils.py +99 -0
  1253. vllm/triton_utils/__init__.py +16 -0
  1254. vllm/triton_utils/importing.py +95 -0
  1255. vllm/usage/__init__.py +0 -0
  1256. vllm/usage/usage_lib.py +259 -0
  1257. vllm/utils/__init__.py +3438 -0
  1258. vllm/utils/deep_gemm.py +212 -0
  1259. vllm/utils/flashinfer.py +372 -0
  1260. vllm/utils/jsontree.py +90 -0
  1261. vllm/utils/tensor_schema.py +236 -0
  1262. vllm/v1/__init__.py +0 -0
  1263. vllm/v1/attention/__init__.py +0 -0
  1264. vllm/v1/attention/backends/__init__.py +0 -0
  1265. vllm/v1/attention/backends/cpu_attn.py +922 -0
  1266. vllm/v1/attention/backends/flash_attn.py +800 -0
  1267. vllm/v1/attention/backends/flashinfer.py +1128 -0
  1268. vllm/v1/attention/backends/flex_attention.py +796 -0
  1269. vllm/v1/attention/backends/gdn_attn.py +320 -0
  1270. vllm/v1/attention/backends/linear_attn.py +68 -0
  1271. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1272. vllm/v1/attention/backends/mamba2_attn.py +224 -0
  1273. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1274. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1275. vllm/v1/attention/backends/mla/common.py +1608 -0
  1276. vllm/v1/attention/backends/mla/cutlass_mla.py +301 -0
  1277. vllm/v1/attention/backends/mla/flashattn_mla.py +273 -0
  1278. vllm/v1/attention/backends/mla/flashinfer_mla.py +110 -0
  1279. vllm/v1/attention/backends/mla/flashmla.py +213 -0
  1280. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1281. vllm/v1/attention/backends/mla/triton_mla.py +175 -0
  1282. vllm/v1/attention/backends/pallas.py +413 -0
  1283. vllm/v1/attention/backends/rocm_aiter_fa.py +548 -0
  1284. vllm/v1/attention/backends/short_conv_attn.py +82 -0
  1285. vllm/v1/attention/backends/tree_attn.py +450 -0
  1286. vllm/v1/attention/backends/triton_attn.py +430 -0
  1287. vllm/v1/attention/backends/utils.py +834 -0
  1288. vllm/v1/attention/backends/xformers.py +437 -0
  1289. vllm/v1/core/__init__.py +0 -0
  1290. vllm/v1/core/block_pool.py +330 -0
  1291. vllm/v1/core/encoder_cache_manager.py +333 -0
  1292. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1293. vllm/v1/core/kv_cache_manager.py +398 -0
  1294. vllm/v1/core/kv_cache_utils.py +1169 -0
  1295. vllm/v1/core/sched/__init__.py +0 -0
  1296. vllm/v1/core/sched/async_scheduler.py +47 -0
  1297. vllm/v1/core/sched/interface.py +158 -0
  1298. vllm/v1/core/sched/output.py +162 -0
  1299. vllm/v1/core/sched/request_queue.py +224 -0
  1300. vllm/v1/core/sched/scheduler.py +1287 -0
  1301. vllm/v1/core/sched/utils.py +69 -0
  1302. vllm/v1/core/single_type_kv_cache_manager.py +670 -0
  1303. vllm/v1/cudagraph_dispatcher.py +121 -0
  1304. vllm/v1/engine/__init__.py +202 -0
  1305. vllm/v1/engine/async_llm.py +757 -0
  1306. vllm/v1/engine/coordinator.py +357 -0
  1307. vllm/v1/engine/core.py +1245 -0
  1308. vllm/v1/engine/core_client.py +1333 -0
  1309. vllm/v1/engine/detokenizer.py +300 -0
  1310. vllm/v1/engine/exceptions.py +17 -0
  1311. vllm/v1/engine/llm_engine.py +332 -0
  1312. vllm/v1/engine/logprobs.py +201 -0
  1313. vllm/v1/engine/output_processor.py +558 -0
  1314. vllm/v1/engine/parallel_sampling.py +133 -0
  1315. vllm/v1/engine/processor.py +524 -0
  1316. vllm/v1/engine/utils.py +857 -0
  1317. vllm/v1/executor/__init__.py +0 -0
  1318. vllm/v1/executor/abstract.py +126 -0
  1319. vllm/v1/executor/multiproc_executor.py +683 -0
  1320. vllm/v1/executor/ray_distributed_executor.py +109 -0
  1321. vllm/v1/kv_cache_interface.py +275 -0
  1322. vllm/v1/metrics/__init__.py +0 -0
  1323. vllm/v1/metrics/loggers.py +717 -0
  1324. vllm/v1/metrics/prometheus.py +82 -0
  1325. vllm/v1/metrics/ray_wrappers.py +133 -0
  1326. vllm/v1/metrics/reader.py +246 -0
  1327. vllm/v1/metrics/stats.py +248 -0
  1328. vllm/v1/outputs.py +147 -0
  1329. vllm/v1/pool/__init__.py +0 -0
  1330. vllm/v1/pool/metadata.py +77 -0
  1331. vllm/v1/request.py +237 -0
  1332. vllm/v1/sample/__init__.py +0 -0
  1333. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1334. vllm/v1/sample/logits_processor/builtin.py +273 -0
  1335. vllm/v1/sample/logits_processor/interface.py +97 -0
  1336. vllm/v1/sample/logits_processor/state.py +161 -0
  1337. vllm/v1/sample/metadata.py +43 -0
  1338. vllm/v1/sample/ops/__init__.py +0 -0
  1339. vllm/v1/sample/ops/bad_words.py +39 -0
  1340. vllm/v1/sample/ops/logprobs.py +26 -0
  1341. vllm/v1/sample/ops/penalties.py +43 -0
  1342. vllm/v1/sample/ops/topk_topp_sampler.py +254 -0
  1343. vllm/v1/sample/rejection_sampler.py +623 -0
  1344. vllm/v1/sample/sampler.py +281 -0
  1345. vllm/v1/sample/tpu/__init__.py +0 -0
  1346. vllm/v1/sample/tpu/metadata.py +124 -0
  1347. vllm/v1/sample/tpu/sampler.py +213 -0
  1348. vllm/v1/serial_utils.py +395 -0
  1349. vllm/v1/spec_decode/__init__.py +0 -0
  1350. vllm/v1/spec_decode/eagle.py +740 -0
  1351. vllm/v1/spec_decode/medusa.py +66 -0
  1352. vllm/v1/spec_decode/metadata.py +62 -0
  1353. vllm/v1/spec_decode/metrics.py +191 -0
  1354. vllm/v1/spec_decode/ngram_proposer.py +157 -0
  1355. vllm/v1/spec_decode/utils.py +14 -0
  1356. vllm/v1/structured_output/__init__.py +297 -0
  1357. vllm/v1/structured_output/backend_guidance.py +245 -0
  1358. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1359. vllm/v1/structured_output/backend_outlines.py +320 -0
  1360. vllm/v1/structured_output/backend_types.py +134 -0
  1361. vllm/v1/structured_output/backend_xgrammar.py +323 -0
  1362. vllm/v1/structured_output/request.py +86 -0
  1363. vllm/v1/structured_output/utils.py +373 -0
  1364. vllm/v1/utils.py +382 -0
  1365. vllm/v1/worker/__init__.py +0 -0
  1366. vllm/v1/worker/block_table.py +221 -0
  1367. vllm/v1/worker/cpu_model_runner.py +163 -0
  1368. vllm/v1/worker/cpu_worker.py +183 -0
  1369. vllm/v1/worker/gpu_input_batch.py +821 -0
  1370. vllm/v1/worker/gpu_model_runner.py +3743 -0
  1371. vllm/v1/worker/gpu_worker.py +697 -0
  1372. vllm/v1/worker/kv_connector_model_runner_mixin.py +122 -0
  1373. vllm/v1/worker/lora_model_runner_mixin.py +192 -0
  1374. vllm/v1/worker/tpu_input_batch.py +585 -0
  1375. vllm/v1/worker/tpu_model_runner.py +1947 -0
  1376. vllm/v1/worker/tpu_worker.py +340 -0
  1377. vllm/v1/worker/utils.py +290 -0
  1378. vllm/v1/worker/worker_base.py +65 -0
  1379. vllm/v1/worker/xpu_model_runner.py +53 -0
  1380. vllm/v1/worker/xpu_worker.py +179 -0
  1381. vllm/version.py +41 -0
  1382. vllm/vllm_flash_attn/.gitkeep +0 -0
  1383. vllm/worker/__init__.py +0 -0
  1384. vllm/worker/cache_engine.py +145 -0
  1385. vllm/worker/enc_dec_model_runner.py +553 -0
  1386. vllm/worker/model_runner.py +2016 -0
  1387. vllm/worker/model_runner_base.py +307 -0
  1388. vllm/worker/utils.py +49 -0
  1389. vllm/worker/worker.py +670 -0
  1390. vllm/worker/worker_base.py +651 -0
  1391. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/METADATA +326 -0
  1392. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/RECORD +1395 -0
  1393. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/WHEEL +5 -0
  1394. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/entry_points.txt +5 -0
  1395. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1951 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ from abc import abstractmethod
5
+ from collections.abc import Iterable
6
+ from enum import Enum
7
+ from typing import Callable, Literal, Optional, Union, overload
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch.nn.parameter import UninitializedParameter
12
+
13
+ import vllm.envs as envs
14
+ from vllm.config import get_current_vllm_config
15
+ from vllm.distributed import (get_dp_group, get_ep_group,
16
+ get_tensor_model_parallel_world_size,
17
+ tensor_model_parallel_all_reduce)
18
+ from vllm.distributed.eplb.eplb_state import EplbState
19
+ from vllm.forward_context import ForwardContext, get_forward_context
20
+ from vllm.logger import init_logger
21
+ from vllm.model_executor.custom_op import CustomOp
22
+ # yapf: disable
23
+ from vllm.model_executor.layers.fused_moe.config import (
24
+ FusedMoEConfig, FusedMoEParallelConfig)
25
+ # yapf: enable
26
+ from vllm.model_executor.layers.fused_moe.modular_kernel import (
27
+ FusedMoEActivationFormat, FusedMoEModularKernel,
28
+ FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize)
29
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
30
+ is_rocm_aiter_moe_enabled)
31
+ from vllm.model_executor.layers.fused_moe.routing_simulator import (
32
+ RoutingSimulator)
33
+ from vllm.model_executor.layers.quantization.base_config import (
34
+ QuantizationConfig, QuantizeMethodBase)
35
+ from vllm.model_executor.utils import set_weight_attrs
36
+ from vllm.platforms import current_platform
37
+ from vllm.platforms.interface import CpuArchEnum
38
+ from vllm.utils import (cdiv, direct_register_custom_op, has_deep_ep, has_pplx,
39
+ round_up)
40
+
41
+ if current_platform.is_cuda_alike():
42
+ from .fused_batched_moe import BatchedTritonExperts
43
+ from .fused_moe import TritonExperts, fused_experts
44
+ if has_pplx():
45
+ from .pplx_prepare_finalize import (PplxPrepareAndFinalize,
46
+ pplx_hidden_dim_scale_bytes)
47
+ if has_deep_ep():
48
+ from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
49
+ from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SHAPE,
50
+ DeepEPLLPrepareAndFinalize)
51
+ else:
52
+ fused_experts = None # type: ignore
53
+ FusedMoEPermuteExpertsUnpermute = None # type: ignore
54
+ FusedMoEPrepareAndFinalize = None # type: ignore
55
+ if is_rocm_aiter_moe_enabled():
56
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501
57
+ rocm_aiter_grouped_topk as grouped_topk)
58
+ elif current_platform.is_cpu():
59
+ pass
60
+ else:
61
+ from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
62
+ if current_platform.is_tpu():
63
+ from .moe_pallas import fused_moe as fused_moe_pallas
64
+ else:
65
+ fused_moe_pallas = None # type: ignore
66
+
67
+ logger = init_logger(__name__)
68
+
69
+
70
+ class FusedMoeWeightScaleSupported(Enum):
71
+ TENSOR = "tensor"
72
+ CHANNEL = "channel"
73
+ GROUP = "group"
74
+ BLOCK = "block"
75
+
76
+
77
+ class FusedMoEMethodBase(QuantizeMethodBase):
78
+
79
+ # TODO(bnell): also pass quant_config?
80
+ def __init__(self, moe: FusedMoEConfig):
81
+ super().__init__()
82
+ self.moe = moe
83
+ self.fused_experts: Optional[Callable] = None
84
+ self.topk_indices_dtype = None
85
+
86
+ @abstractmethod
87
+ def create_weights(self, layer: torch.nn.Module, num_experts: int,
88
+ hidden_size: int, intermediate_size_per_partition: int,
89
+ params_dtype: torch.dtype, **extra_weight_attrs):
90
+ raise NotImplementedError
91
+
92
+ def uses_weight_scale_2_pattern(self) -> bool:
93
+ """
94
+ Returns True if this quantization method uses 'weight_scale_2' pattern
95
+ for per-tensor weight scales (e.g., FP4 variants), False otherwise.
96
+
97
+ This method should be overridden by subclasses that use the
98
+ 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
99
+ """
100
+ return False
101
+
102
+ @staticmethod
103
+ def _maybe_make_prepare_finalize(
104
+ moe: FusedMoEConfig, ) -> Optional[FusedMoEPrepareAndFinalize]:
105
+ all2all_manager = get_ep_group().device_communicator.all2all_manager
106
+ assert all2all_manager is not None
107
+
108
+ prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None
109
+
110
+ assert not moe.use_flashinfer_cutlass_kernels, \
111
+ "Must be created in modelopt.py"
112
+
113
+ if moe.use_pplx_kernels:
114
+ hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes(
115
+ moe.max_num_tokens,
116
+ moe.hidden_dim,
117
+ moe.in_dtype,
118
+ moe.quant_dtype,
119
+ per_act_token_quant=moe.per_act_token_quant,
120
+ block_shape=moe.block_shape,
121
+ )
122
+
123
+ all_to_all_args = dict(
124
+ max_num_tokens=moe.max_num_tokens,
125
+ num_experts=moe.num_experts,
126
+ experts_per_token=moe.experts_per_token, # topk
127
+ rank=all2all_manager.rank,
128
+ world_size=all2all_manager.world_size,
129
+ # dp_size actually means tp_size, bug in pplx kernels
130
+ dp_size=all2all_manager.tp_group.world_size,
131
+ hidden_dim=moe.hidden_dim,
132
+ hidden_dim_bytes=hidden_dim_bytes,
133
+ hidden_dim_scale_bytes=hidden_scale_bytes,
134
+ )
135
+
136
+ num_dispatchers = (all2all_manager.world_size //
137
+ all2all_manager.tp_group.world_size)
138
+
139
+ # Intranode pplx a2a takes a group name while internode does not.
140
+ if not all2all_manager.internode:
141
+ all_to_all_args[
142
+ "group_name"] = all2all_manager.cpu_group.group_name
143
+
144
+ handle = all2all_manager.get_handle(all_to_all_args)
145
+
146
+ prepare_finalize = PplxPrepareAndFinalize(
147
+ handle,
148
+ max_num_tokens=moe.max_num_tokens,
149
+ num_local_experts=moe.num_local_experts,
150
+ num_dispatchers=num_dispatchers,
151
+ )
152
+ elif moe.use_deepep_ht_kernels:
153
+ assert moe.dp_size == all2all_manager.dp_world_size
154
+
155
+ all_to_all_args = dict()
156
+ handle = all2all_manager.get_handle(all_to_all_args)
157
+ prepare_finalize = DeepEPHTPrepareAndFinalize(
158
+ handle,
159
+ num_dispatchers=all2all_manager.world_size,
160
+ dp_size=all2all_manager.dp_world_size,
161
+ rank_expert_offset=all2all_manager.rank *
162
+ moe.num_local_experts,
163
+ )
164
+
165
+ elif moe.use_deepep_ll_kernels:
166
+ all_to_all_args = dict(
167
+ max_num_tokens_per_dp_rank=moe.max_num_tokens,
168
+ token_hidden_size=moe.hidden_dim,
169
+ num_ep_ranks=all2all_manager.world_size,
170
+ num_global_experts=moe.num_experts,
171
+ num_local_experts=moe.num_experts //
172
+ all2all_manager.world_size)
173
+ handle = all2all_manager.get_handle(all_to_all_args)
174
+
175
+             # Note: We may want to use FP8 dispatch even otherwise, just to
176
+             # reduce data movement.
177
+ use_fp8_dispatch = (moe.quant_config is not None
178
+ and moe.quant_config.quant_dtype
179
+ == current_platform.fp8_dtype()
180
+ and moe.quant_config.block_shape
181
+ == DEEPEP_QUANT_BLOCK_SHAPE)
182
+
183
+ prepare_finalize = DeepEPLLPrepareAndFinalize(
184
+ handle,
185
+ max_tokens_per_rank=moe.max_num_tokens,
186
+ num_dispatchers=all2all_manager.world_size,
187
+ use_fp8_dispatch=use_fp8_dispatch,
188
+ )
189
+
190
+ return prepare_finalize
191
+
192
+ def maybe_make_prepare_finalize(
193
+ self,
194
+ moe: FusedMoEConfig,
195
+ ) -> Optional[FusedMoEPrepareAndFinalize]:
196
+ if moe.moe_parallel_config.use_all2all_kernels:
197
+ return FusedMoEMethodBase._maybe_make_prepare_finalize(moe)
198
+ else:
199
+ return None
200
+
201
+ # Note: init_prepare_finalize should only be called by
202
+ # prepare_communication_buffer_for_model.
203
+ def init_prepare_finalize(self, layer: torch.nn.Module):
204
+ assert self.moe is not None
205
+ prepare_finalize = self.maybe_make_prepare_finalize(self.moe)
206
+
207
+ if prepare_finalize is not None:
208
+ logger.debug("%s for %s(%s)", prepare_finalize.__class__.__name__,
209
+ self, id(self))
210
+ assert self.topk_indices_dtype is None
211
+ assert self.fused_experts is None, \
212
+ f"Attempt to override experts for {id(self)}!"
213
+ self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
214
+ experts = self.select_gemm_impl(prepare_finalize, self.moe, layer)
215
+ self.fused_experts = FusedMoEModularKernel(
216
+ prepare_finalize,
217
+ experts,
218
+ layer.shared_experts,
219
+ )
220
+
221
+ def select_gemm_impl(
222
+ self,
223
+ prepare_finalize: FusedMoEPrepareAndFinalize,
224
+ moe: FusedMoEConfig,
225
+ layer: torch.nn.Module,
226
+ ) -> FusedMoEPermuteExpertsUnpermute:
227
+ # based on the all2all implementation, select the appropriate
228
+ # gemm implementation
229
+ raise NotImplementedError(
230
+ f"{self.__class__.__name__} must select appropriate gemm "
231
+ "implementation based on the prepare_finalize")
232
+
233
+ @abstractmethod
234
+ def apply(
235
+ self,
236
+ layer: torch.nn.Module,
237
+ x: torch.Tensor,
238
+ router_logits: torch.Tensor,
239
+ top_k: int,
240
+ renormalize: bool,
241
+ use_grouped_topk: bool = False,
242
+ topk_group: Optional[int] = None,
243
+ num_expert_group: Optional[int] = None,
244
+ global_num_experts: int = -1,
245
+ expert_map: Optional[torch.Tensor] = None,
246
+ custom_routing_function: Optional[Callable] = None,
247
+ scoring_func: str = "softmax",
248
+ routed_scaling_factor: float = 1.0,
249
+ e_score_correction_bias: Optional[torch.Tensor] = None,
250
+ apply_router_weight_on_input: bool = False,
251
+ activation: str = "silu",
252
+ enable_eplb: bool = False,
253
+ expert_load_view: Optional[torch.Tensor] = None,
254
+ logical_to_physical_map: Optional[torch.Tensor] = None,
255
+ logical_replica_count: Optional[torch.Tensor] = None,
256
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
257
+ raise NotImplementedError
258
+
259
+
260
+ @CustomOp.register("unquantized_fused_moe")
261
+ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
262
+ """MoE method without quantization."""
263
+
264
+ def __init__(self, moe: FusedMoEConfig):
265
+ super().__init__(moe)
266
+ self.has_bias = self.moe.has_bias
267
+ self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
268
+ if self.rocm_aiter_moe_enabled:
269
+ from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
270
+ self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
271
+ else:
272
+ self.rocm_aiter_fused_experts = None # type: ignore
273
+
274
+ def select_gemm_impl(
275
+ self,
276
+ prepare_finalize: FusedMoEPrepareAndFinalize,
277
+ # TODO(bnell): Remove. Every layer should have an moe config object.
278
+ moe: FusedMoEConfig,
279
+ layer: torch.nn.Module,
280
+ ) -> FusedMoEPermuteExpertsUnpermute:
281
+ if (prepare_finalize.activation_format ==
282
+ FusedMoEActivationFormat.BatchedExperts):
283
+ logger.debug("BatchedTritonExperts %s", self.moe)
284
+ return BatchedTritonExperts(
285
+ max_num_tokens=self.moe.max_num_tokens,
286
+ num_dispatchers=prepare_finalize.num_dispatchers(),
287
+ )
288
+ else:
289
+ logger.debug("TritonExperts %s", self.moe)
290
+ return TritonExperts()
291
+
292
+ def create_weights(self, layer: torch.nn.Module, num_experts: int,
293
+ hidden_size: int, intermediate_size_per_partition: int,
294
+ params_dtype: torch.dtype, **extra_weight_attrs):
295
+ # Fused gate_up_proj (column parallel)
296
+ w13_weight = torch.nn.Parameter(torch.empty(
297
+ num_experts,
298
+ 2 * intermediate_size_per_partition,
299
+ hidden_size,
300
+ dtype=params_dtype),
301
+ requires_grad=False)
302
+ layer.register_parameter("w13_weight", w13_weight)
303
+ set_weight_attrs(w13_weight, extra_weight_attrs)
304
+ if self.has_bias:
305
+ w13_bias = torch.nn.Parameter(torch.zeros(
306
+ num_experts,
307
+ 2 * intermediate_size_per_partition,
308
+ dtype=params_dtype),
309
+ requires_grad=False)
310
+ layer.register_parameter("w13_bias", w13_bias)
311
+ set_weight_attrs(w13_bias, extra_weight_attrs)
312
+ # down_proj (row parallel)
313
+ w2_weight = torch.nn.Parameter(torch.empty(
314
+ num_experts,
315
+ hidden_size,
316
+ intermediate_size_per_partition,
317
+ dtype=params_dtype),
318
+ requires_grad=False)
319
+ layer.register_parameter("w2_weight", w2_weight)
320
+ set_weight_attrs(w2_weight, extra_weight_attrs)
321
+ if self.has_bias:
322
+ w2_bias = torch.nn.Parameter(torch.zeros(num_experts,
323
+ hidden_size,
324
+ dtype=params_dtype),
325
+ requires_grad=False)
326
+ layer.register_parameter("w2_bias", w2_bias)
327
+ set_weight_attrs(w2_bias, extra_weight_attrs)
328
+
329
+ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
330
+         # Pad the weight tensor. This is an optimization on the ROCm platform, which
331
+         # benefits from tensors being located far enough from one another in memory.
332
+ if (envs.VLLM_ROCM_MOE_PADDING and current_platform.is_rocm()
333
+ and weight.stride(-1) == 1
334
+ and (weight.stride(-2) * weight.element_size()) % 512 == 0):
335
+ num_pad = 256 // weight.element_size()
336
+ weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
337
+ torch.cuda.empty_cache()
338
+ return weight
339
+
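# Illustrative sketch (editorial, hypothetical shapes; not part of the packaged
# file): padding and then slicing, as _maybe_pad_weight does above, keeps the
# logical shape while leaving a 256-byte gap between rows.
import torch
import torch.nn.functional as F

w = torch.randn(8, 512, dtype=torch.float16)
num_pad = 256 // w.element_size()                       # 128 fp16 elements
padded = F.pad(w, (0, num_pad), "constant", 0)[..., :-num_pad]
assert padded.shape == w.shape and padded.stride(0) == 512 + num_pad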
340
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
341
+ super().process_weights_after_loading(layer)
342
+
343
+ # Padding the weight for better performance on ROCm
344
+ layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
345
+ layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
346
+ # Lazy import to avoid importing triton.
347
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
348
+ shuffle_weights)
349
+
350
+ if self.rocm_aiter_moe_enabled:
351
+ shuffled_w13, shuffled_w2 = shuffle_weights(
352
+ layer.w13_weight.data, layer.w2_weight.data)
353
+
354
+ layer.w13_weight.data = shuffled_w13
355
+ layer.w2_weight.data = shuffled_w2
356
+
357
+ if current_platform.is_xpu():
358
+ import intel_extension_for_pytorch as ipex
359
+ layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
360
+ layer.w13_weight,
361
+ layer.w2_weight,
362
+ use_prepack=True,
363
+ )
364
+ elif current_platform.is_cpu():
365
+ from vllm.model_executor.layers.fused_moe import cpu_fused_moe
366
+ if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
367
+ from vllm.model_executor.layers.utils import (
368
+ check_cpu_sgl_kernel)
369
+ dtype_w13 = layer.w13_weight.dtype
370
+ _, n_w13, k_w13 = layer.w13_weight.size()
371
+ dtype_w2 = layer.w2_weight.dtype
372
+ _, n_w2, k_w2 = layer.w2_weight.size()
373
+ if (envs.VLLM_CPU_SGL_KERNEL
374
+ and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13)
375
+ and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)):
376
+ packed_w13_weight = torch.ops._C.convert_weight_packed(
377
+ layer.w13_weight)
378
+ assert packed_w13_weight.size() == layer.w13_weight.size()
379
+ layer.w13_weight.copy_(packed_w13_weight)
380
+ del packed_w13_weight
381
+ packed_w2_weight = torch.ops._C.convert_weight_packed(
382
+ layer.w2_weight)
383
+ assert packed_w2_weight.size() == layer.w2_weight.size()
384
+ layer.w2_weight.copy_(packed_w2_weight)
385
+ layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer)
386
+ else:
387
+ layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer)
388
+ else:
389
+ layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
390
+
391
+ def apply(
392
+ self,
393
+ layer: torch.nn.Module,
394
+ x: torch.Tensor,
395
+ router_logits: torch.Tensor,
396
+ top_k: int,
397
+ renormalize: bool,
398
+ use_grouped_topk: bool = False,
399
+ topk_group: Optional[int] = None,
400
+ num_expert_group: Optional[int] = None,
401
+ global_num_experts: int = -1,
402
+ expert_map: Optional[torch.Tensor] = None,
403
+ custom_routing_function: Optional[Callable] = None,
404
+ scoring_func: str = "softmax",
405
+ routed_scaling_factor: float = 1.0,
406
+ e_score_correction_bias: Optional[torch.Tensor] = None,
407
+ apply_router_weight_on_input: bool = False,
408
+ activation: str = "silu",
409
+ enable_eplb: bool = False,
410
+ expert_load_view: Optional[torch.Tensor] = None,
411
+ logical_to_physical_map: Optional[torch.Tensor] = None,
412
+ logical_replica_count: Optional[torch.Tensor] = None,
413
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
414
+ if enable_eplb:
415
+ assert expert_load_view is not None
416
+ assert logical_to_physical_map is not None
417
+ assert logical_replica_count is not None
418
+ assert isinstance(layer, FusedMoE)
419
+
420
+ return self.forward(
421
+ x=x,
422
+ layer=layer,
423
+ router_logits=router_logits,
424
+ top_k=top_k,
425
+ renormalize=renormalize,
426
+ use_grouped_topk=use_grouped_topk,
427
+ topk_group=topk_group,
428
+ num_expert_group=num_expert_group,
429
+ global_num_experts=global_num_experts,
430
+ expert_map=expert_map,
431
+ custom_routing_function=custom_routing_function,
432
+ scoring_func=scoring_func,
433
+ routed_scaling_factor=routed_scaling_factor,
434
+ e_score_correction_bias=e_score_correction_bias,
435
+ activation=activation,
436
+ apply_router_weight_on_input=apply_router_weight_on_input,
437
+ enable_eplb=enable_eplb,
438
+ expert_load_view=expert_load_view,
439
+ logical_to_physical_map=logical_to_physical_map,
440
+ logical_replica_count=logical_replica_count,
441
+ )
442
+
443
+ def forward_cuda(
444
+ self,
445
+ layer: torch.nn.Module,
446
+ x: torch.Tensor,
447
+ use_grouped_topk: bool,
448
+ top_k: int,
449
+ router_logits: torch.Tensor,
450
+ renormalize: bool,
451
+ topk_group: Optional[int] = None,
452
+ num_expert_group: Optional[int] = None,
453
+ global_num_experts: int = -1,
454
+ expert_map: Optional[torch.Tensor] = None,
455
+ custom_routing_function: Optional[Callable] = None,
456
+ scoring_func: str = "softmax",
457
+ routed_scaling_factor: float = 1.0,
458
+ e_score_correction_bias: Optional[torch.Tensor] = None,
459
+ apply_router_weight_on_input: bool = False,
460
+ activation: str = "silu",
461
+ enable_eplb: bool = False,
462
+ expert_load_view: Optional[torch.Tensor] = None,
463
+ logical_to_physical_map: Optional[torch.Tensor] = None,
464
+ logical_replica_count: Optional[torch.Tensor] = None,
465
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
466
+
467
+ topk_weights, topk_ids = FusedMoE.select_experts(
468
+ hidden_states=x,
469
+ router_logits=router_logits,
470
+ use_grouped_topk=use_grouped_topk,
471
+ top_k=top_k,
472
+ renormalize=renormalize,
473
+ topk_group=topk_group,
474
+ num_expert_group=num_expert_group,
475
+ custom_routing_function=custom_routing_function,
476
+ scoring_func=scoring_func,
477
+ routed_scaling_factor=routed_scaling_factor,
478
+ e_score_correction_bias=e_score_correction_bias,
479
+ indices_type=self.topk_indices_dtype,
480
+ enable_eplb=enable_eplb,
481
+ expert_map=expert_map,
482
+ expert_load_view=expert_load_view,
483
+ logical_to_physical_map=logical_to_physical_map,
484
+ logical_replica_count=logical_replica_count)
485
+
486
+ if self.rocm_aiter_moe_enabled:
487
+ return self.rocm_aiter_fused_experts(
488
+ hidden_states=x,
489
+ w1=layer.w13_weight,
490
+ w2=layer.w2_weight,
491
+ topk_weights=topk_weights,
492
+ topk_ids=topk_ids,
493
+ expert_map=expert_map,
494
+ activation=activation,
495
+ apply_router_weight_on_input=apply_router_weight_on_input)
496
+ elif self.fused_experts is not None:
497
+ if self.has_bias:
498
+ raise ValueError(
499
+ "FusedMoEModularKernel does not support bias.")
500
+ return self.fused_experts(
501
+ hidden_states=x,
502
+ w1=layer.w13_weight,
503
+ w2=layer.w2_weight,
504
+ topk_weights=topk_weights,
505
+ topk_ids=topk_ids,
506
+ inplace=True,
507
+ activation=activation,
508
+ apply_router_weight_on_input=apply_router_weight_on_input,
509
+ global_num_experts=global_num_experts,
510
+ expert_map=expert_map,
511
+ )
512
+ else:
513
+ assert fused_experts is not None
514
+ return fused_experts(
515
+ hidden_states=x,
516
+ w1=layer.w13_weight,
517
+ w2=layer.w2_weight,
518
+ w1_bias=layer.w13_bias if self.has_bias else None,
519
+ w2_bias=layer.w2_bias if self.has_bias else None,
520
+ topk_weights=topk_weights,
521
+ topk_ids=topk_ids,
522
+ inplace=True,
523
+ activation=activation,
524
+ apply_router_weight_on_input=apply_router_weight_on_input,
525
+ global_num_experts=global_num_experts,
526
+ expert_map=expert_map,
527
+ )
528
+
529
+ def forward_cpu(
530
+ self,
531
+ layer: torch.nn.Module,
532
+ x: torch.Tensor,
533
+ use_grouped_topk: bool,
534
+ top_k: int,
535
+ router_logits: torch.Tensor,
536
+ renormalize: bool,
537
+ topk_group: Optional[int] = None,
538
+ num_expert_group: Optional[int] = None,
539
+ global_num_experts: int = -1,
540
+ expert_map: Optional[torch.Tensor] = None,
541
+ custom_routing_function: Optional[Callable] = None,
542
+ scoring_func: str = "softmax",
543
+ routed_scaling_factor: float = 1.0,
544
+ e_score_correction_bias: Optional[torch.Tensor] = None,
545
+ apply_router_weight_on_input: bool = False,
546
+ activation: str = "silu",
547
+ enable_eplb: bool = False,
548
+ expert_load_view: Optional[torch.Tensor] = None,
549
+ logical_to_physical_map: Optional[torch.Tensor] = None,
550
+ logical_replica_count: Optional[torch.Tensor] = None,
551
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
552
+ if enable_eplb is not False or expert_load_view is not None or \
553
+ logical_to_physical_map is not None or \
554
+ logical_replica_count is not None:
555
+ raise NotImplementedError("Expert load balancing is not supported "
556
+ "for CPU.")
557
+ return layer.cpu_fused_moe(
558
+ layer,
559
+ x,
560
+ use_grouped_topk,
561
+ top_k,
562
+ router_logits,
563
+ renormalize,
564
+ topk_group,
565
+ num_expert_group,
566
+ global_num_experts,
567
+ expert_map,
568
+ custom_routing_function,
569
+ scoring_func,
570
+ routed_scaling_factor,
571
+ e_score_correction_bias,
572
+ apply_router_weight_on_input,
573
+ activation,
574
+ )
575
+
576
+ def forward_xpu(
577
+ self,
578
+ layer: torch.nn.Module,
579
+ x: torch.Tensor,
580
+ use_grouped_topk: bool,
581
+ top_k: int,
582
+ router_logits: torch.Tensor,
583
+ renormalize: bool,
584
+ topk_group: Optional[int] = None,
585
+ num_expert_group: Optional[int] = None,
586
+ global_num_experts: int = -1,
587
+ expert_map: Optional[torch.Tensor] = None,
588
+ custom_routing_function: Optional[Callable] = None,
589
+ scoring_func: str = "softmax",
590
+ routed_scaling_factor: float = 1.0,
591
+ e_score_correction_bias: Optional[torch.Tensor] = None,
592
+ apply_router_weight_on_input: bool = False,
593
+ activation: str = "silu",
594
+ enable_eplb: bool = False,
595
+ expert_load_view: Optional[torch.Tensor] = None,
596
+ logical_to_physical_map: Optional[torch.Tensor] = None,
597
+ logical_replica_count: Optional[torch.Tensor] = None,
598
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
599
+ if enable_eplb is not False or expert_load_view is not None or \
600
+ logical_to_physical_map is not None or \
601
+ logical_replica_count is not None:
602
+ raise NotImplementedError("Expert load balancing is not supported "
603
+ "for XPU.")
604
+ assert custom_routing_function is None
605
+ return layer.ipex_fusion(
606
+ x,
607
+ use_grouped_topk,
608
+ top_k,
609
+ router_logits,
610
+ renormalize,
611
+ topk_group,
612
+ num_expert_group,
613
+ )
614
+
615
+ def forward_tpu(
616
+ self,
617
+ layer: torch.nn.Module,
618
+ x: torch.Tensor,
619
+ use_grouped_topk: bool,
620
+ top_k: int,
621
+ router_logits: torch.Tensor,
622
+ renormalize: bool,
623
+ topk_group: Optional[int] = None,
624
+ num_expert_group: Optional[int] = None,
625
+ global_num_experts: int = -1,
626
+ expert_map: Optional[torch.Tensor] = None,
627
+ custom_routing_function: Optional[Callable] = None,
628
+ scoring_func: str = "softmax",
629
+ routed_scaling_factor: float = 1.0,
630
+ e_score_correction_bias: Optional[torch.Tensor] = None,
631
+ apply_router_weight_on_input: bool = False,
632
+ activation: str = "silu",
633
+ enable_eplb: bool = False,
634
+ expert_load_view: Optional[torch.Tensor] = None,
635
+ logical_to_physical_map: Optional[torch.Tensor] = None,
636
+ logical_replica_count: Optional[torch.Tensor] = None,
637
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
638
+ assert not use_grouped_topk
639
+ assert num_expert_group is None
640
+ assert topk_group is None
641
+ assert custom_routing_function is None
642
+ assert apply_router_weight_on_input is False
643
+ if scoring_func != "softmax":
644
+ raise NotImplementedError(
645
+ "Only softmax scoring function is supported for TPU.")
646
+ if e_score_correction_bias is not None:
647
+ raise NotImplementedError(
648
+ "Expert score correction bias is not supported for TPU.")
649
+ assert activation == "silu", f"{activation} is not supported for TPU."
650
+ assert routed_scaling_factor == 1.0, \
651
+ f"routed_scaling_factor {routed_scaling_factor} is not supported " \
652
+ f"for TPU."
653
+ if enable_eplb is not False or expert_load_view is not None or \
654
+ logical_to_physical_map is not None or \
655
+ logical_replica_count is not None:
656
+ raise NotImplementedError("Expert load balancing is not supported "
657
+ "for TPU.")
658
+ return fused_moe_pallas(hidden_states=x,
659
+ w1=layer.w13_weight,
660
+ w2=layer.w2_weight,
661
+ topk=top_k,
662
+ gating_output=router_logits,
663
+ global_num_experts=global_num_experts,
664
+ expert_map=expert_map,
665
+ renormalize=renormalize)
666
+
667
+ if current_platform.is_tpu():
668
+ forward_native = forward_tpu
669
+ elif current_platform.is_cpu():
670
+ forward_native = forward_cpu
671
+ elif current_platform.is_xpu():
672
+ forward_native = forward_xpu
673
+ else:
674
+ forward_native = forward_cuda
675
+
676
+
677
+ def determine_expert_map(
678
+ ep_size: int, ep_rank: int,
679
+ global_num_experts: int) -> tuple[int, Optional[torch.Tensor]]:
680
+ """
681
+ Calculates how many experts should be assigned to each rank for EP and
682
+ creates a mapping from global to local expert index. Experts are
683
+     distributed as evenly as possible across ranks; any remainder is assigned,
684
+     one extra expert each, to the lowest-numbered ranks.
685
+
686
+ Args:
687
+         ep_size (int): The size of the expert parallel group.
688
+         ep_rank (int): The rank of the current process within the expert
+             parallel group.
+         global_num_experts (int): The total number of experts in the model.
689
+
690
+ Returns:
691
+ tuple[int, Optional[torch.Tensor]]: A tuple containing:
692
+ - local_num_experts (int): The number of experts assigned
693
+ to the current rank.
694
+ - expert_map (Optional[torch.Tensor]): A tensor of shape
695
+ (global_num_experts,) mapping from global to local index.
696
+ Contains -1 for experts not assigned to the current rank.
697
+ Returns None if ep_size is 1.
698
+ """
699
+ assert ep_size > 0
700
+ if ep_size == 1:
701
+ return (global_num_experts, None)
702
+
703
+ # Distribute experts as evenly as possible to each rank.
704
+ base_experts = global_num_experts // ep_size
705
+ remainder = global_num_experts % ep_size
706
+ if ep_rank < remainder:
707
+ local_num_experts = base_experts + 1
708
+ else:
709
+ local_num_experts = base_experts
710
+
711
+ # Create a tensor of size num_experts filled with -1
712
+ expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
713
+ # Create an expert map for the local experts
714
+ start_idx = ep_rank * base_experts + min(ep_rank, remainder)
715
+ expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
716
+ 0, local_num_experts, dtype=torch.int32)
717
+ return (local_num_experts, expert_map)
718
+
719
+
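# Illustrative sketch (editorial, hypothetical sizes; assumes this module is
# importable): determine_expert_map with 9 experts over ep_size=4 gives rank 0
# three experts and ranks 1-3 two each; remainders go to the lowest-numbered ranks.
local_num, expert_map = determine_expert_map(ep_size=4, ep_rank=1,
                                             global_num_experts=9)
# local_num == 2
# expert_map.tolist() == [-1, -1, -1, 0, 1, -1, -1, -1, -1]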
720
+ def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
721
+ """
722
+ Compresses the expert map by removing any -1 entries.
723
+
724
+ Args:
725
+ expert_map (torch.Tensor): A tensor of shape (global_num_experts,)
726
+ mapping from global to local index. Contains -1 for experts not
727
+ assigned to the current rank.
728
+
729
+ Returns:
730
+         str: A string mapping from local to global index.
731
+             A str is returned so the value is hashable for one-time ("log once") logging.
732
+ """
733
+ global_indices = torch.where(expert_map != -1)[0]
734
+ local_indices = expert_map[global_indices]
735
+ return ", ".join(
736
+ f"{local_index.item()}->{global_index.item()}"
737
+ for local_index, global_index in zip(local_indices, global_indices))
738
+
739
+
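# Illustrative sketch (editorial, hypothetical values): the compressed form of
# the rank-1 expert map from the sketch above drops all -1 entries and keeps
# only the "local->global" pairs.
import torch

expert_map = torch.tensor([-1, -1, -1, 0, 1, -1, -1, -1, -1], dtype=torch.int32)
print(get_compressed_expert_map(expert_map))  # "0->3, 1->4"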
740
+ @CustomOp.register("fused_moe")
741
+ class FusedMoE(CustomOp):
742
+ """FusedMoE layer for MoE models.
743
+
744
+ This layer contains both MergedColumnParallel weights (gate_up_proj /
745
+     w13) and RowParallelLinear weights (down_proj / w2).
746
+
747
+ Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
748
+ copy that naming convention here and handle any remapping in the
749
+ load_weights function in each model implementation.
750
+
751
+ Args:
752
+ num_experts: Number of experts in the model
753
+ top_k: Number of experts selected for each token
754
+ hidden_size: Input hidden state size of the transformer
755
+ intermediate_size: Intermediate size of the experts
756
+ params_dtype: Data type for the parameters.
757
+ reduce_results: Whether to all all_reduce on the output of the layer
758
+ renormalize: Whether to renormalize the logits in the fused_moe kernel
759
+ quant_config: Quantization configure.
760
+ enable_eplb: Whether to enable expert parallelism load balancer.
761
+ """
762
+
763
+ def __init__(
764
+ self,
765
+ num_experts: int, # Global number of experts
766
+ top_k: int,
767
+ hidden_size: int,
768
+ intermediate_size: int,
769
+ params_dtype: Optional[torch.dtype] = None,
770
+ reduce_results: bool = False,
771
+ renormalize: bool = True,
772
+ use_grouped_topk: bool = False,
773
+ num_expert_group: Optional[int] = None,
774
+ topk_group: Optional[int] = None,
775
+ quant_config: Optional[QuantizationConfig] = None,
776
+ tp_size: Optional[int] = None,
777
+ ep_size: Optional[int] = None,
778
+ dp_size: Optional[int] = None,
779
+ prefix: str = "",
780
+ custom_routing_function: Optional[Callable] = None,
781
+ scoring_func: str = "softmax",
782
+ routed_scaling_factor: float = 1.0,
783
+ e_score_correction_bias: Optional[torch.Tensor] = None,
784
+ apply_router_weight_on_input: bool = False,
785
+ activation: str = "silu",
786
+ enable_eplb: bool = False,
787
+ num_redundant_experts: int = 0,
788
+ has_bias: bool = False,
789
+ is_sequence_parallel=False,
790
+ ):
791
+ super().__init__()
792
+ if params_dtype is None:
793
+ params_dtype = torch.get_default_dtype()
794
+ self.params_dtype = params_dtype
795
+
796
+ tp_size_ = (tp_size if tp_size is not None else
797
+ get_tensor_model_parallel_world_size())
798
+ dp_size_ = (dp_size
799
+ if dp_size is not None else get_dp_group().world_size)
800
+
801
+ self.is_sequence_parallel = is_sequence_parallel
802
+ if self.is_sequence_parallel:
803
+ self.sp_size = tp_size_
804
+
805
+ vllm_config = get_current_vllm_config()
806
+ self.moe_parallel_config: FusedMoEParallelConfig = (
807
+ FusedMoEParallelConfig.make(
808
+ tp_size_=tp_size_,
809
+ dp_size_=dp_size_,
810
+ vllm_parallel_config=vllm_config.parallel_config))
811
+
812
+ self.global_num_experts = num_experts + num_redundant_experts
813
+
814
+ # we are padding globally so EP buffer allocation works
815
+ if quant_config and quant_config.get_name() == "mxfp4":
816
+ from vllm.model_executor.layers.quantization.mxfp4 import (
817
+ Mxfp4Backend, get_mxfp4_backend)
818
+ current_mxfp4_backend = get_mxfp4_backend()
819
+ if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
820
+ or current_mxfp4_backend
821
+ == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS):
822
+ hidden_size = round_up(hidden_size, 128)
823
+ elif (current_platform.is_rocm() or current_mxfp4_backend
824
+ == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM or
825
+ current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
826
+ hidden_size = round_up(hidden_size, 256)
827
+
828
+ # For smuggling this layer into the fused moe custom op
829
+ compilation_config = vllm_config.compilation_config
830
+ if prefix in compilation_config.static_forward_context:
831
+ raise ValueError("Duplicate layer name: {}".format(prefix))
832
+ compilation_config.static_forward_context[prefix] = self
833
+ self.layer_name = prefix
834
+
835
+ self.enable_eplb = enable_eplb
836
+ self.expert_load_view: Optional[torch.Tensor] = None
837
+ self.logical_to_physical_map: Optional[torch.Tensor] = None
838
+ self.logical_replica_count: Optional[torch.Tensor] = None
839
+
840
+ # Determine expert maps
841
+ if self.use_ep:
842
+ if self.enable_eplb:
843
+ assert self.global_num_experts % self.ep_size == 0, \
844
+ "EPLB currently only supports even distribution of " \
845
+ "experts across ranks."
846
+ else:
847
+ assert num_redundant_experts == 0, \
848
+ "Redundant experts are only supported with EPLB."
849
+ self.local_num_experts, self.expert_map = determine_expert_map(
850
+ ep_size=self.ep_size,
851
+ ep_rank=self.ep_rank,
852
+ global_num_experts=self.global_num_experts)
853
+ logger.info_once(
854
+ "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
855
+ " number of experts: %s/%s. Experts local to global index map:"
856
+ " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
857
+ self.global_num_experts,
858
+ get_compressed_expert_map(self.expert_map))
859
+ else:
860
+ self.local_num_experts, self.expert_map = (self.global_num_experts,
861
+ None)
862
+
863
+ self.top_k = top_k
864
+
865
+ assert intermediate_size % self.tp_size == 0
866
+ self.hidden_size = hidden_size
867
+ self.intermediate_size_per_partition = intermediate_size // self.tp_size
868
+ self.reduce_results = reduce_results
869
+ self.renormalize = renormalize
870
+ self.use_grouped_topk = use_grouped_topk
871
+ if self.use_grouped_topk:
872
+ assert num_expert_group is not None and topk_group is not None
873
+ self.num_expert_group = num_expert_group
874
+ self.topk_group = topk_group
875
+ self.custom_routing_function = custom_routing_function
876
+ self.scoring_func = scoring_func
877
+ self.routed_scaling_factor = routed_scaling_factor
878
+ self.e_score_correction_bias = e_score_correction_bias
879
+ self.apply_router_weight_on_input = apply_router_weight_on_input
880
+ self.activation = activation
881
+
882
+ if self.scoring_func != "softmax" and not self.use_grouped_topk:
883
+ raise ValueError("Only softmax scoring function is supported for "
884
+ "non-grouped topk.")
885
+
886
+ if vllm_config.model_config is not None:
887
+ model_dtype = vllm_config.model_config.dtype
888
+ else:
889
+ # TODO (bnell): This is a hack to get test_mixtral_moe to work
890
+ # since model_config is not set in the pytest test.
891
+ model_dtype = params_dtype
892
+
893
+ moe = FusedMoEConfig.make(num_experts=self.global_num_experts,
894
+ experts_per_token=top_k,
895
+ hidden_dim=hidden_size,
896
+ num_local_experts=self.local_num_experts,
897
+ moe_parallel_config=self.moe_parallel_config,
898
+ in_dtype=model_dtype,
899
+ max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE,
900
+ quant_config=quant_config,
901
+ has_bias=has_bias)
902
+ self.moe_config = moe
903
+ self.quant_config = quant_config
904
+
905
+ # Note: get_quant_method will look at the layer's local_num_experts
906
+ # for heuristic purposes, so it must be initialized first.
907
+ quant_method: Optional[QuantizeMethodBase] = None
908
+ quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None
909
+ else quant_config.get_quant_method(self, prefix))
910
+
911
+ assert quant_method is not None
912
+ assert isinstance(quant_method, FusedMoEMethodBase)
913
+ self.quant_method = quant_method
914
+
915
+ if self.enable_eplb:
916
+ from vllm.model_executor.layers.quantization.fp8 import (
917
+ Fp8MoEMethod)
918
+ if not isinstance(quant_method,
919
+ (Fp8MoEMethod, UnquantizedFusedMoEMethod)):
920
+ # TODO: Add support for additional quantization methods.
921
+ # The implementation for other quantization methods does not
922
+ # contain essential differences, but the current quant API
923
+ # design causes duplicated work when extending to new
924
+ # quantization methods, so I'm leaving it for now.
925
+ # If you plan to add support for more quantization methods,
926
+ # please refer to the implementation in `Fp8MoEMethod`.
927
+ raise NotImplementedError("EPLB is only supported for FP8 "
928
+ "quantization for now.")
929
+
930
+ moe_quant_params = {
931
+ "num_experts": self.local_num_experts,
932
+ "hidden_size": hidden_size,
933
+ "intermediate_size_per_partition":
934
+ self.intermediate_size_per_partition,
935
+ "params_dtype": params_dtype,
936
+ "weight_loader": self.weight_loader,
937
+ }
938
+ # need full intermediate size pre-sharding for WNA16 act order
939
+ if (self.quant_method.__class__.__name__
940
+ in ("GPTQMarlinMoEMethod",
941
+ "CompressedTensorsWNA16MarlinMoEMethod",
942
+ "CompressedTensorsWNA16MoEMethod")):
943
+ moe_quant_params["intermediate_size_full"] = intermediate_size
944
+
945
+ self.quant_method.create_weights(layer=self, **moe_quant_params)
946
+
947
+ # Chunked all2all staging tensor
948
+ self.batched_hidden_states: Optional[torch.Tensor] = None
949
+ self.batched_router_logits: Optional[torch.Tensor] = None
950
+ if (self.moe_parallel_config.use_pplx_kernels
951
+ or self.moe_parallel_config.use_deepep_ll_kernels
952
+ or self.moe_config.use_flashinfer_cutlass_kernels):
953
+ self.batched_hidden_states = torch.zeros(
954
+ (moe.max_num_tokens, self.hidden_size),
955
+ dtype=moe.in_dtype,
956
+ device=torch.cuda.current_device())
957
+
958
+ # Note here we use `num_experts` which is logical expert count
959
+ self.batched_router_logits = torch.zeros(
960
+ (moe.max_num_tokens, num_experts),
961
+ dtype=moe.in_dtype,
962
+ device=torch.cuda.current_device())
963
+
964
+ @property
965
+ def shared_experts(self) -> Optional[torch.nn.Module]:
966
+ return None
967
+
968
+ @property
969
+ def tp_size(self):
970
+ return self.moe_parallel_config.tp_size
971
+
972
+ @property
973
+ def dp_size(self):
974
+ return self.moe_parallel_config.dp_size
975
+
976
+ @property
977
+ def ep_size(self):
978
+ return self.moe_parallel_config.ep_size
979
+
980
+ @property
981
+ def tp_rank(self):
982
+ return self.moe_parallel_config.tp_rank
983
+
984
+ @property
985
+ def dp_rank(self):
986
+ return self.moe_parallel_config.dp_rank
987
+
988
+ @property
989
+ def ep_rank(self):
990
+ return self.moe_parallel_config.ep_rank
991
+
992
+ @property
993
+ def use_ep(self):
994
+ return self.moe_parallel_config.use_ep
995
+
996
+ @property
997
+ def use_pplx_kernels(self):
998
+ return self.moe_parallel_config.use_pplx_kernels
999
+
1000
+ @property
1001
+ def use_deepep_ht_kernels(self):
1002
+ return self.moe_parallel_config.use_deepep_ht_kernels
1003
+
1004
+ @property
1005
+ def use_deepep_ll_kernels(self):
1006
+ return self.moe_parallel_config.use_deepep_ll_kernels
1007
+
1008
+ @property
1009
+ def use_flashinfer_cutlass_kernels(self):
1010
+ return self.moe_config.use_flashinfer_cutlass_kernels
1011
+
1012
+ def update_expert_map(self):
1013
+ # ep_size and ep_rank should already be updated
1014
+ assert self.expert_map is not None
1015
+ with self.expert_map.device:
1016
+ self.local_num_experts, self.expert_map = determine_expert_map(
1017
+ ep_size=self.ep_size,
1018
+ ep_rank=self.ep_rank,
1019
+ global_num_experts=self.global_num_experts)
1020
+
1021
+ def _load_per_tensor_weight_scale(self, shard_id: str,
1022
+ param: torch.nn.Parameter,
1023
+ loaded_weight: torch.Tensor,
1024
+ expert_id: int):
1025
+ param_data = param.data
1026
+ # for per tensor weight quantization
1027
+ if shard_id in ("w1", "w3"):
1028
+ # We have to keep the weight scales of w1 and w3 because
1029
+ # we need to re-quantize w1/w3 weights after weight loading.
1030
+ idx = 0 if shard_id == "w1" else 1
1031
+ param_data[expert_id][idx] = loaded_weight
1032
+ # If we are in the row parallel case (down_proj)
1033
+ elif shard_id == "w2":
1034
+ param_data[expert_id] = loaded_weight
1035
+
1036
+ def _load_combined_w13_weight_scale(self, shard_dim: int,
1037
+ loaded_weight: torch.Tensor,
1038
+ param: torch.Tensor, tp_rank: int):
1039
+ """
1040
+ Load w13 weight scales assuming that w1 weight scales and w3 weight
1041
+ scales are stored in the same loaded_weight tensor.
1042
+ """
1043
+ shard_size = param.shape[shard_dim]
1044
+ loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
1045
+ shard_size)
1046
+ param.copy_(loaded_weight)
1047
+
1048
+ def _load_model_weight_or_group_weight_scale(self,
1049
+ shard_dim: int,
1050
+ expert_data: torch.Tensor,
1051
+ shard_id: str,
1052
+ loaded_weight: torch.Tensor,
1053
+ tp_rank: int,
1054
+ load_full_w2: bool = False):
1055
+ """
1056
+ Load grouped weight scales for group quantization or model weights
1057
+ :param shard_dim: dimension to shard
1058
+ :param expert_data: parameter for a particular expert
1059
+ :param shard_id: either w1, w2, or w3
1060
+ :param loaded_weight: checkpoint weight to load into the param
1061
+ :param tp_rank: tensor parallel rank
1062
+ :param load_full_w2: whether or not the w2 loaded should be sharded.
1063
+ """
1064
+ if shard_id == "w2":
1065
+ # In the case where we have actorder/g_idx, we do not partition the
1066
+ # w2 scales, as indicated by `load_full` argument, for all tp cases
1067
+ self._load_w2(shard_dim=shard_dim,
1068
+ loaded_weight=loaded_weight,
1069
+ expert_data=expert_data,
1070
+ tp_rank=tp_rank,
1071
+ load_full=load_full_w2)
1072
+ elif shard_id in ("w1", "w3"):
1073
+ self._load_w13(shard_id=shard_id,
1074
+ shard_dim=shard_dim,
1075
+ loaded_weight=loaded_weight,
1076
+ expert_data=expert_data,
1077
+ tp_rank=tp_rank)
1078
+
1079
+ def _load_per_channel_weight_scale(self, expert_data: torch.Tensor,
1080
+ shard_dim: int, shard_id: str,
1081
+ loaded_weight: torch.Tensor,
1082
+ tp_rank: int):
1083
+ # for per channel weight quantization
1084
+ if shard_id == "w2":
1085
+ expert_data.copy_(loaded_weight)
1086
+ elif shard_id in ("w1", "w3"):
1087
+ self._load_w13(shard_id=shard_id,
1088
+ shard_dim=shard_dim,
1089
+ loaded_weight=loaded_weight,
1090
+ expert_data=expert_data,
1091
+ tp_rank=tp_rank)
1092
+
1093
+ def _load_w13(self,
1094
+ expert_data: torch.Tensor,
1095
+ shard_dim: int,
1096
+ shard_id: str,
1097
+ loaded_weight: torch.Tensor,
1098
+ tp_rank: int,
1099
+ load_full: bool = False):
1100
+
1101
+ # Index the loaded weight for tp sharding.
1102
+ # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
1103
+ shard_size = expert_data.shape[shard_dim] // 2
1104
+ if not load_full:
1105
+ loaded_weight = loaded_weight.narrow(shard_dim,
1106
+ shard_size * tp_rank,
1107
+ shard_size)
1108
+ # Narrow parameter and load.
1109
+ # w1, gate_proj: Load into first logical weight of w13.
1110
+ if shard_id == "w1":
1111
+ expert_data = expert_data.narrow(shard_dim, 0, shard_size)
1112
+ # w3, up_proj: Load into second logical weight of w13.
1113
+ else:
1114
+ assert shard_id == "w3"
1115
+ expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
1116
+ expert_data.copy_(loaded_weight)
1117
+
1118
+ def _load_w2(self,
1119
+ expert_data: torch.Tensor,
1120
+ shard_dim: int,
1121
+ loaded_weight: torch.Tensor,
1122
+ tp_rank: int,
1123
+ load_full: bool = False):
1124
+
1125
+ # Index the loaded weight for tp sharding.
1126
+ # down_proj: "RowParallel" so tp sharding on input_dim
1127
+ # Narrow parameter and load.
1128
+ shard_size = expert_data.shape[shard_dim]
1129
+ if not load_full:
1130
+ loaded_weight = loaded_weight.narrow(shard_dim,
1131
+ shard_size * tp_rank,
1132
+ shard_size)
1133
+ # w2, down_proj: Load into only logical weight of w2.
1134
+ expert_data.copy_(loaded_weight)
1135
+
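# Illustrative sketch (editorial, hypothetical shapes; not part of the packaged
# file): the narrow-based TP sharding used by _load_w13/_load_w2 above. A
# gate_proj ("w1") checkpoint slice lands in the first half of this rank's w13.
import torch

num_experts, intermediate, hidden, tp_size, tp_rank = 4, 8, 16, 2, 0
w1_ckpt = torch.randn(num_experts, intermediate, hidden)        # full checkpoint
w13_local = torch.empty(num_experts, 2 * intermediate // tp_size, hidden)

shard_size = w13_local.shape[1] // 2                            # rows of w1 per rank
shard = w1_ckpt[0].narrow(0, shard_size * tp_rank, shard_size)  # this rank's slice
w13_local[0, :shard_size].copy_(shard)                          # w1 -> first half of w13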
1136
+ def _load_single_value(self, param: torch.nn.Parameter,
1137
+ loaded_weight: torch.Tensor, expert_id: int):
1138
+ param_data = param.data
1139
+
1140
+ # Input scales can be loaded directly and should be equal.
1141
+ param_data[expert_id] = loaded_weight
1142
+
1143
+ def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
1144
+ shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int):
1145
+
1146
+ if shard_id == "w2":
1147
+ self._load_w2(shard_dim=shard_dim,
1148
+ loaded_weight=loaded_weight,
1149
+ expert_data=expert_data,
1150
+ tp_rank=tp_rank)
1151
+ else:
1152
+ assert shard_id in ("w1", "w3")
1153
+ expert_data.copy_(loaded_weight)
1154
+
1155
+ def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
1156
+ if self.expert_map is None:
1157
+ return expert_id
1158
+ return self.expert_map[expert_id].item()
1159
+
1160
+ @overload
1161
+ def weight_loader(self, param: torch.nn.Parameter,
1162
+ loaded_weight: torch.Tensor, weight_name: str,
1163
+ shard_id: str, expert_id: int,
1164
+ return_success: Literal[False]) -> None:
1165
+ ...
1166
+
1167
+ @overload
1168
+ def weight_loader(self, param: torch.nn.Parameter,
1169
+ loaded_weight: torch.Tensor, weight_name: str,
1170
+ shard_id: str, expert_id: int,
1171
+ return_success: Literal[True]) -> bool:
1172
+ ...
1173
+
1174
+ def weight_loader(self,
1175
+ param: torch.nn.Parameter,
1176
+ loaded_weight: torch.Tensor,
1177
+ weight_name: str,
1178
+ shard_id: str,
1179
+ expert_id: int,
1180
+ return_success: bool = False) -> Optional[bool]:
1181
+
1182
+ if self.quant_config and self.quant_config.get_name() == "mxfp4":
1183
+ # (FIXME) for gpt-oss all experts are combined
1184
+ if "bias" in weight_name:
1185
+ dim1 = loaded_weight.shape[1]
1186
+ param.data[:, :dim1].copy_(loaded_weight)
1187
+ else:
1188
+ dim1 = loaded_weight.shape[1]
1189
+ dim2 = loaded_weight.shape[2]
1190
+ param.data[:, :dim1, :dim2].copy_(loaded_weight)
1191
+ return True if return_success else None
1192
+
1193
+ expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
1194
+ if expert_id == -1:
1195
+ # Failed to load this param since it's not local to this rank
1196
+ return False if return_success else None
1197
+ # Hereafter, `expert_id` is local physical id
1198
+
1199
+ quant_method_name = self.quant_method.__class__.__name__
1200
+ # compressed-tensors checkpoints with packed weights are stored flipped
1201
+ # TODO (mgoin): check self.quant_method.quant_config.quant_format
1202
+ # against known CompressionFormat enum values that have this quality
1203
+ if self.quant_method.__class__.__name__ in (
1204
+ "CompressedTensorsWNA16MarlinMoEMethod",
1205
+ "CompressedTensorsWNA16MoEMethod"):
1206
+ loaded_weight = loaded_weight.t().contiguous()
1207
+
1208
+ if shard_id not in ("w1", "w2", "w3"):
1209
+ raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
1210
+ f"got {shard_id}.")
1211
+
1212
+ # Fetch the dim to shard the parameter/loaded weight
1213
+         # based on the shard id. This will be whichever
1214
+         # dimension intermediate_size_per_partition is sharded along.
1215
+ SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
1216
+
1217
+ is_gguf_weight = getattr(param, "is_gguf_weight", False)
1218
+ is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
1219
+ if is_gguf_weight_type:
1220
+ param.weight_type = loaded_weight.item()
1221
+ param.data.copy_(loaded_weight)
1222
+ return True if return_success else None
1223
+
1224
+ # Case for BitsAndBytes
1225
+ use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
1226
+ if use_bitsandbytes_4bit:
1227
+ shard_dim = 0
1228
+
1229
+ expert_data = param.data[expert_id]
1230
+ if shard_id == "w2":
1231
+ expert_data.copy_(loaded_weight)
1232
+ elif shard_id in ("w1", "w3"):
1233
+ # BNB inflight quantization has already sharded the weights
1234
+ full_load = True
1235
+ self._load_w13(
1236
+ shard_id=shard_id,
1237
+ shard_dim=shard_dim,
1238
+ loaded_weight=loaded_weight,
1239
+ expert_data=expert_data,
1240
+ tp_rank=self.tp_rank,
1241
+ load_full=full_load,
1242
+ )
1243
+ return True if return_success else None
1244
+
1245
+         # is_transposed: whether the dim to shard the weight
1246
+         # should be flipped. Required by GPTQ and compressed-tensors; the shard
1247
+         # dim should be whichever dimension intermediate_size_per_partition lies on.
1248
+ is_transposed = getattr(param, "is_transposed", False)
1249
+ shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
1250
+ if is_transposed:
1251
+ shard_dim = int(not shard_dim)
1252
+
1253
+ full_load = len(loaded_weight.shape) == 3
1254
+ if full_load:
1255
+ shard_dim += 1
1256
+
1257
+ # Materialize GGUF UninitializedParameter
1258
+ if is_gguf_weight and isinstance(param, UninitializedParameter):
1259
+ final_shape = list(loaded_weight.shape)
1260
+ if shard_id in ["w1", "w3"]:
1261
+ final_shape[1] *= 2
1262
+ final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
1263
+ param.materialize(final_shape, dtype=loaded_weight.dtype)
1264
+
1265
+ expert_data = param.data if full_load else param.data[expert_id]
1266
+
1267
+ # Case input scale: input_scale loading is only supported for fp8
1268
+ if "input_scale" in weight_name:
1269
+ # this is needed for compressed-tensors only
1270
+ loaded_weight = loaded_weight.to(param.data.device)
1271
+
1272
+ if ("compressed" in quant_method_name.lower()
1273
+ and param.data[expert_id] != 1
1274
+ and (param.data[expert_id] - loaded_weight).abs() > 1e-5):
1275
+ raise ValueError(
1276
+ "input_scales of w1 and w3 of a layer "
1277
+ f"must be equal. But got {param.data[expert_id]} "
1278
+ f"vs. {loaded_weight}")
1279
+
1280
+ self._load_single_value(param=param,
1281
+ loaded_weight=loaded_weight,
1282
+ expert_id=expert_id)
1283
+ return True if return_success else None
1284
+
1285
+ # Case g_idx
1286
+ if "g_idx" in weight_name:
1287
+ self._load_g_idx(shard_dim=0,
1288
+ shard_id=shard_id,
1289
+ loaded_weight=loaded_weight,
1290
+ expert_data=expert_data,
1291
+ tp_rank=self.tp_rank)
1292
+ return True if return_success else None
1293
+
1294
+ # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern
1295
+ if "ModelOpt" in quant_method_name:
1296
+ # Determine per-tensor weight scale patterns based on variant
1297
+ # Use the dedicated method instead of brittle string matching
1298
+ uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern(
1299
+ )
1300
+
1301
+ # Call _load_per_tensor_weight_scale() to load per-tensor (scalar)
1302
+ # weights scales.
1303
+ # Input scales are always per-tensor.
1304
+ # Weight scales: FP4 uses "weight_scale_2" and FP8 uses
1305
+ # "weight_scale" for per-tensor scales.
1306
+ is_per_tensor = ("weight_scale_2" in weight_name
1307
+ if uses_weight_scale_2 else "weight_scale"
1308
+ in weight_name) or "input_scale" in weight_name
1309
+ if is_per_tensor:
1310
+ self._load_per_tensor_weight_scale(
1311
+ shard_id=shard_id,
1312
+ param=param,
1313
+ loaded_weight=loaded_weight,
1314
+ expert_id=expert_id,
1315
+ )
1316
+ return True if return_success else None
1317
+
1318
+ # If the weight is w13_weight_scale and w13_weight_scales are
1319
+ # combined into single loaded_weight, call
1320
+ # _load_combined_w13_weight_scale() to load it.
1321
+ # This is checked by comparing the hidden_out dims of the
1322
+ # loaded_weight and the param.
1323
+ if "w13_weight_scale" in weight_name:
1324
+ loaded_weight_hidden_out = loaded_weight.shape[-2]
1325
+ param_hidden_out = param.data.shape[-2] * self.tp_size
1326
+ if loaded_weight_hidden_out == param_hidden_out:
1327
+ self._load_combined_w13_weight_scale(
1328
+ shard_dim=shard_dim,
1329
+ loaded_weight=loaded_weight,
1330
+ param=param,
1331
+ tp_rank=self.tp_rank,
1332
+ )
1333
+ return True if return_success else None
1334
+
1335
+ # For other weights, call _load_model_weight_or_group_weight_scale()
1336
+ # to load it.
1337
+ if "weight" in weight_name:
1338
+ self._load_model_weight_or_group_weight_scale(
1339
+ shard_id=shard_id,
1340
+ shard_dim=shard_dim,
1341
+ loaded_weight=loaded_weight,
1342
+ expert_data=expert_data,
1343
+ tp_rank=self.tp_rank)
1344
+ return True if return_success else None
1345
+
1346
+ # Case weight scales, zero_points and offset, weight/input global scales
1347
+ if ("scale" in weight_name or "zero" in weight_name
1348
+ or "offset" in weight_name):
1349
+ # load the weight scales and zp based on the quantization scheme
1350
+ # supported weight scales/zp can be found in
1351
+ # FusedMoeWeightScaleSupported
1352
+ # TODO @dsikka: once hardened, refactor to use vLLM Parameters
1353
+ # specific to each case
1354
+ quant_method = getattr(param, "quant_method", None)
1355
+ if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
1356
+ self._load_per_channel_weight_scale(
1357
+ shard_id=shard_id,
1358
+ shard_dim=shard_dim,
1359
+ loaded_weight=loaded_weight,
1360
+ expert_data=expert_data,
1361
+ tp_rank=self.tp_rank)
1362
+ elif quant_method in [
1363
+ FusedMoeWeightScaleSupported.GROUP.value,
1364
+ FusedMoeWeightScaleSupported.BLOCK.value,
1365
+ ]:
1366
+ self._load_model_weight_or_group_weight_scale(
1367
+ shard_id=shard_id,
1368
+ shard_dim=shard_dim,
1369
+ loaded_weight=loaded_weight,
1370
+ expert_data=expert_data,
1371
+ tp_rank=self.tp_rank,
1372
+ load_full_w2=getattr(param, "load_full_w2", False))
1373
+ elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
1374
+ self._load_per_tensor_weight_scale(shard_id=shard_id,
1375
+ param=param,
1376
+ loaded_weight=loaded_weight,
1377
+ expert_id=expert_id)
1378
+ else:
1379
+ WEIGHT_SCALE_SUPPORTED = [
1380
+ e.value for e in FusedMoeWeightScaleSupported
1381
+ ]
1382
+ raise ValueError(
1383
+ f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
1384
+ return True if return_success else None
1385
+
1386
+ # Case weight_shape
1387
+ if "weight_shape" in weight_name:
1388
+ # only required by compressed-tensors
1389
+ self._load_single_value(param=param,
1390
+ loaded_weight=loaded_weight,
1391
+ expert_id=expert_id)
1392
+ return True if return_success else None
1393
+
1394
+ # Case model weights
1395
+ if "weight" in weight_name:
1396
+ self._load_model_weight_or_group_weight_scale(
1397
+ shard_id=shard_id,
1398
+ shard_dim=shard_dim,
1399
+ loaded_weight=loaded_weight,
1400
+ expert_data=expert_data,
1401
+ tp_rank=self.tp_rank)
1402
+ return True if return_success else None
1403
+
1404
+ return False if return_success else None
1405
+
1406
+ def get_expert_weights(self) -> Iterable[torch.Tensor]:
1407
+ weights = list(self.named_parameters())
1408
+ assert all(weight.is_contiguous() for _, weight in weights)
1409
+
1410
+ # Filter out the non-expert weights.
1411
+ # `e_score_correction_bias` is a bias for each logical expert,
1412
+ # with shape (num_logical_experts,), not an expert weight.
1413
+ NON_EXPERT_WEIGHTS = {
1414
+ "e_score_correction_bias",
1415
+ }
1416
+
1417
+ return [
1418
+ weight.view(self.local_num_experts, -1) for name, weight in weights
1419
+ if name not in NON_EXPERT_WEIGHTS
1420
+ and not name.startswith("_shared_experts.")
1421
+ ]
1422
+
1423
+ def set_eplb_state(
1424
+ self,
1425
+ moe_layer_idx: int,
1426
+ expert_load_view: torch.Tensor,
1427
+ logical_to_physical_map: torch.Tensor,
1428
+ logical_replica_count: torch.Tensor,
1429
+ ) -> None:
1430
+ """
1431
+ Register the EPLB state in this layer.
1432
+
1433
+ This is used later in forward pass, where we get the expert mapping
1434
+ and record the load metrics in `expert_load_view`.
1435
+ """
1436
+ self.expert_load_view = expert_load_view[moe_layer_idx]
1437
+ self.logical_to_physical_map = logical_to_physical_map[moe_layer_idx]
1438
+ self.logical_replica_count = logical_replica_count[moe_layer_idx]
1439
+
1440
    @staticmethod
    def select_experts(
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        use_grouped_topk: bool,
        renormalize: bool,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: Optional[torch.Tensor] = None,
        indices_type: Optional[torch.dtype] = None,
        enable_eplb: bool = False,
        expert_map: Optional[torch.Tensor] = None,
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Route the input hidden states to the top-k experts based on the
        router logits.

        Returns:
            (topk_weights, topk_ids) (tuple[torch.Tensor, torch.Tensor]):
            The weights and *global physical* expert ids of the top-k experts.

            **Compatibility**: When EPLB is not enabled, the returned ids are
            equivalent to global logical ids, so they should be compatible
            with plain MoE implementations without redundant experts.
        """
        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk

        # Check if we should use a routing simulation strategy
        routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
        if routing_strategy != "":
            return RoutingSimulator.simulate_routing(
                hidden_states=hidden_states,
                router_logits=router_logits,
                strategy_name=routing_strategy,
                top_k=top_k,
                indices_type=indices_type)

        # DeepSeekV2 uses grouped_topk
        if use_grouped_topk:
            assert topk_group is not None
            assert num_expert_group is not None
            topk_weights, topk_ids = grouped_topk(
                hidden_states=hidden_states,
                gating_output=router_logits,
                topk=top_k,
                renormalize=renormalize,
                num_expert_group=num_expert_group,
                topk_group=topk_group,
                scoring_func=scoring_func,
                routed_scaling_factor=routed_scaling_factor,
                e_score_correction_bias=e_score_correction_bias)
            if indices_type is not None:
                topk_ids = topk_ids.to(dtype=indices_type)
        elif custom_routing_function is None:
            topk_weights, topk_ids, token_expert_indices = fused_topk(
                hidden_states=hidden_states,
                gating_output=router_logits,
                topk=top_k,
                renormalize=renormalize,
                indices_type=indices_type,
            )
        else:
            topk_weights, topk_ids = custom_routing_function(
                hidden_states=hidden_states,
                gating_output=router_logits,
                topk=top_k,
                renormalize=renormalize)
            if indices_type is not None:
                topk_ids = topk_ids.to(dtype=indices_type)

        if enable_eplb:
            assert expert_load_view is not None
            assert logical_to_physical_map is not None
            assert logical_replica_count is not None

            # 1. Convert the logical expert ids to physical expert ids.
            # Directly select a random replica for each logical expert.

            # TODO: maybe optimize this by using specialized kernels,
            # or compute pseudo-random indices by modulo.

            # In case `indices_type` is not `torch.long` or `torch.int`,
            # e.g. `torch.uint32` as required by dispatch/combine kernels.
            topk_ids_long = topk_ids.long()
            replica_indices = (
                torch.rand_like(topk_ids, dtype=torch.float) *
                logical_replica_count[topk_ids_long]).long().unsqueeze(-1)
            physical_ids = logical_to_physical_map[topk_ids_long].gather(
                -1, replica_indices).squeeze(-1)

            topk_ids = physical_ids

            # 2. Record expert load metrics.

            # TODO(bowen): When using `FusedMoEModularKernel`, this
            # can be done in a more unified way, since
            # `FusedMoEPrepareAndFinalize` will return the expert
            # token count, in some cases directly from the kernel.
            # However, there are currently many code paths that do not use
            # the modular kernel, e.g. calling `fused_experts` directly,
            # so we keep the logic here.
            #
            # If a later refactor moves all the MoE kernel calls
            # to the modular kernel, we can move this logic there
            # to achieve better efficiency.

            # `expert_load_view`: (num_physical_experts,)

            topk_ids_flatten = topk_ids.flatten()

            # Performance optimization:
            # `masked_fill` is significantly faster than `masked_select`.
            invalid_mask = topk_ids_flatten < 0
            # Replace invalid expert ids with 0 (just a dummy position)
            # to avoid out-of-bounds errors in scatter_add_.
            index = topk_ids_flatten.masked_fill_(invalid_mask, 0)
            # `src` is the valid mask, which is 1 for valid and 0 for invalid.
            src = ~invalid_mask

            expert_load_view.scatter_add_(dim=0,
                                          index=index.long(),
                                          src=src.to(expert_load_view))

            topk_ids = topk_ids.to(dtype=indices_type)

        assert topk_ids.dtype == indices_type or indices_type is None

        return topk_weights, topk_ids

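The EPLB branch above does two things: pick a random physical replica for each selected logical expert, and accumulate per-expert load with scatter_add_. The standalone sketch below replays that tensor arithmetic on made-up toy tensors; the concrete values are assumptions, not data from vLLM.

    # Illustrative sketch (not from the package): logical -> physical replica
    # selection and load recording, as in select_experts().
    import torch

    num_physical = 6
    logical_to_physical_map = torch.tensor(
        [[0, 1], [2, 2], [3, 4], [5, 5]])        # replica slots per logical expert
    logical_replica_count = torch.tensor([2, 1, 2, 1])
    topk_ids = torch.tensor([[0, 2], [1, 3]])    # logical ids, shape (tokens, top_k)

    # Pick a random replica index in [0, replica_count) per selected expert,
    # then gather the corresponding physical id.
    topk_ids_long = topk_ids.long()
    replica_indices = (torch.rand_like(topk_ids, dtype=torch.float) *
                       logical_replica_count[topk_ids_long]).long().unsqueeze(-1)
    physical_ids = logical_to_physical_map[topk_ids_long].gather(
        -1, replica_indices).squeeze(-1)

    # Record one unit of load per routed token on each physical expert.
    expert_load_view = torch.zeros(num_physical)
    flat = physical_ids.flatten()
    valid = flat >= 0
    expert_load_view.scatter_add_(0, flat.masked_fill(~valid, 0).long(),
                                  valid.to(expert_load_view))
    print(physical_ids, expert_load_view)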
    def must_reduce_shared_expert_outputs(self) -> bool:
        """
        The shared_experts are typically computed using the RowParallelLinear
        layer. The result of this function is typically used as the
        reduce_results argument to that module.
        When only tensor parallelism is used, it is not necessary to reduce
        the shared_experts results immediately. Instead we reduce once at the
        end of the MoE op. (Refer to the DeepSeekV2MoE module.)
        With EP and all2all kernels this is no longer viable, since all
        GPU ranks in the DP group produce the complete set of hidden_states.
        Therefore the shared_experts output must be reduced early.
        """
        return (self.use_pplx_kernels or self.use_deepep_ht_kernels
                or self.use_deepep_ll_kernels)

    def maybe_all_reduce_tensor_model_parallel(
            self, final_hidden_states: torch.Tensor):
        """
        The pplx combine kernel reduces across GPU ranks by default.
        """
        if (self.use_pplx_kernels or self.use_deepep_ht_kernels
                or self.use_deepep_ll_kernels):
            return final_hidden_states
        else:
            return tensor_model_parallel_all_reduce(final_hidden_states)

    def forward_native(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        og_hidden_states = hidden_states.shape[-1]
        if self.hidden_size != og_hidden_states:
            hidden_states = F.pad(hidden_states,
                                  (0, self.hidden_size - og_hidden_states),
                                  mode='constant',
                                  value=0.0)

        if self.shared_experts is None:
            if current_platform.is_tpu():
                # TODO: Once the OOM issue for the TPU backend is resolved, we
                # will switch to using the moe_forward custom op.
                fused_output = self.forward_impl(hidden_states, router_logits)
                assert not isinstance(fused_output, tuple)
            else:
                fused_output = torch.ops.vllm.moe_forward(
                    hidden_states, router_logits, self.layer_name)
            return fused_output[..., :og_hidden_states]
        else:
            if current_platform.is_tpu():
                # TODO: Once the OOM issue for the TPU backend is resolved, we
                # will switch to using the moe_forward custom op.
                shared_output, fused_output = self.forward_impl(
                    hidden_states, router_logits)
            else:
                shared_output, fused_output = torch.ops.vllm.moe_forward_shared(
                    hidden_states, router_logits, self.layer_name)
            return (shared_output[..., :og_hidden_states],
                    fused_output[..., :og_hidden_states])

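The pad-then-slice pattern above keeps the kernel-facing hidden size aligned while callers still see the model's hidden size. A small standalone sketch with assumed sizes:

    # Illustrative sketch (not from the package): zero-pad the last dimension
    # up to an aligned width, then slice the result back afterwards.
    import torch
    import torch.nn.functional as F

    og_hidden_states = 6      # model hidden size (assumed)
    padded_hidden_size = 8    # e.g. padded for kernel alignment (assumed)
    x = torch.randn(3, og_hidden_states)

    x_padded = F.pad(x, (0, padded_hidden_size - og_hidden_states),
                     mode='constant', value=0.0)
    assert x_padded.shape == (3, padded_hidden_size)

    # The MoE output is later sliced back to the original width.
    y = x_padded[..., :og_hidden_states]
    assert torch.equal(y, x)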
    def forward_cuda(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        return self.forward_native(hidden_states, router_logits)

    def forward_impl_chunked(
        self,
        full_hidden_states: torch.Tensor,
        full_router_logits: torch.Tensor,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        assert self.batched_hidden_states is not None
        assert self.batched_router_logits is not None
        assert self.batched_hidden_states.dtype == full_hidden_states.dtype
        assert self.batched_router_logits.dtype == full_router_logits.dtype
        # Check size compatibility.
        assert (
            self.batched_hidden_states.size(-1) == full_hidden_states.size(-1))
        assert (
            self.batched_router_logits.size(-1) == full_router_logits.size(-1))

        full_fused_final_hidden_states = torch.empty_like(full_hidden_states)
        if self.shared_experts is not None:
            full_shared_final_hidden_states = torch.empty_like(
                full_hidden_states)

        def process_chunk(chunk_start, chunk_end, skip_result_store=False):
            chunk_size = chunk_end - chunk_start
            hidden_states = full_hidden_states[chunk_start:chunk_end, :]
            router_logits = full_router_logits[chunk_start:chunk_end, :]

            assert (self.batched_hidden_states.size(0)  # type: ignore
                    >= chunk_size)
            assert (self.batched_router_logits.size(0)  # type: ignore
                    >= chunk_size)
            staged_hidden_states = self.batched_hidden_states[:chunk_size, :]  # type: ignore
            staged_router_logits = self.batched_router_logits[:chunk_size, :]  # type: ignore
            staged_hidden_states.copy_(hidden_states, non_blocking=True)
            staged_router_logits.copy_(router_logits, non_blocking=True)

            # Matrix multiply.
            final_hidden_states = self.quant_method.apply(
                layer=self,
                x=staged_hidden_states,
                router_logits=staged_router_logits,
                top_k=self.top_k,
                renormalize=self.renormalize,
                use_grouped_topk=self.use_grouped_topk,
                global_num_experts=self.global_num_experts,
                expert_map=self.expert_map,
                topk_group=self.topk_group,
                num_expert_group=self.num_expert_group,
                custom_routing_function=self.custom_routing_function,
                scoring_func=self.scoring_func,
                routed_scaling_factor=self.routed_scaling_factor,
                e_score_correction_bias=self.e_score_correction_bias,
                activation=self.activation,
                enable_eplb=self.enable_eplb,
                expert_load_view=self.expert_load_view,
                logical_to_physical_map=self.logical_to_physical_map,
                logical_replica_count=self.logical_replica_count,
            )

            assert self.shared_experts is None or isinstance(
                final_hidden_states, tuple)

            if not skip_result_store:
                if self.shared_experts is None:
                    full_fused_final_hidden_states[
                        chunk_start:chunk_end, :].copy_(final_hidden_states,
                                                        non_blocking=True)
                else:
                    full_shared_final_hidden_states[
                        chunk_start:chunk_end, :].copy_(final_hidden_states[0],
                                                        non_blocking=True)
                    full_fused_final_hidden_states[
                        chunk_start:chunk_end, :].copy_(final_hidden_states[1],
                                                        non_blocking=True)

        ctx = get_forward_context()
        # flashinfer_cutlass_kernels can handle: optional DP + TP/EP
        max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu
        moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens

        # If the input to the MoE is sequence parallel then divide by sp_size
        # to find the maximum number of tokens for any individual dispatcher.
        if self.is_sequence_parallel:
            max_tokens_across_dispatchers = cdiv(max_tokens_across_dispatchers,
                                                 self.sp_size)

        num_tokens = full_hidden_states.size(0)
        for chunk_idx, chunk_start_ in enumerate(
                range(0, max_tokens_across_dispatchers,
                      moe_dp_chunk_size_per_rank)):
            chunk_start = chunk_start_
            chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
                            max_tokens_across_dispatchers)
            # clamp start and end
            chunk_start = min(chunk_start, num_tokens - 1)
            chunk_end = min(chunk_end, num_tokens)
            with ctx.dp_metadata.chunked_sizes(moe_dp_chunk_size_per_rank,
                                               chunk_idx):
                process_chunk(chunk_start,
                              chunk_end,
                              skip_result_store=chunk_start_ >= num_tokens)

        if self.shared_experts is None:
            return full_fused_final_hidden_states
        else:
            return (full_shared_final_hidden_states,
                    full_fused_final_hidden_states)

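Every rank walks the same number of chunks, derived from the maximum token count across dispatchers, and ranks that run out of local tokens execute dummy chunks whose results are discarded. A standalone sketch of that loop arithmetic, with all sizes assumed for illustration:

    # Illustrative sketch (not from the package): chunk boundaries and the
    # skip_result_store flag, as computed in forward_impl_chunked().
    def cdiv(a: int, b: int) -> int:
        # Ceiling division, matching the helper used above.
        return -(-a // b)

    sp_size = 2
    max_tokens_across_dispatchers = cdiv(2000, sp_size)   # 1000 after SP split
    chunk_size = 256                                       # assumed per-rank chunk size
    num_tokens = 300                                       # tokens on this rank

    for chunk_start_ in range(0, max_tokens_across_dispatchers, chunk_size):
        chunk_end = min(chunk_start_ + chunk_size, max_tokens_across_dispatchers)
        # Clamp to the local token count so slicing stays in bounds.
        chunk_start = min(chunk_start_, num_tokens - 1)
        chunk_end = min(chunk_end, num_tokens)
        skip_result_store = chunk_start_ >= num_tokens
        print(chunk_start, chunk_end, skip_result_store)
    # Output:
    #   0 256 False
    #   256 300 False
    #   299 300 True
    #   299 300 True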
    def forward_impl(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        assert self.quant_method is not None
        # Route to the chunked forward path using the FlashInfer Cutlass kernel
        # only when data parallelism (DP) is enabled.
        use_flashinfer_cutlass_kernels = (
            self.dp_size > 1
            and self.moe_config.use_flashinfer_cutlass_kernels)
        if (self.moe_parallel_config.use_pplx_kernels
                or self.moe_parallel_config.use_deepep_ll_kernels
                or use_flashinfer_cutlass_kernels):
            return self.forward_impl_chunked(hidden_states, router_logits)

        do_naive_dispatch_combine: bool = (
            self.dp_size > 1
            and not self.moe_parallel_config.use_deepep_ht_kernels
            and not self.moe_config.use_flashinfer_cutlass_kernels)

        # If there are shared experts but we are not using a modular kernel,
        # the shared experts must be called here.
        if (not isinstance(self.quant_method.fused_experts,
                           FusedMoEModularKernel)
                and self.shared_experts is not None):
            shared_output = self.shared_experts(hidden_states)
        else:
            shared_output = None

        if do_naive_dispatch_combine:
            hidden_states, router_logits = get_ep_group().dispatch(
                hidden_states, router_logits)

        # Matrix multiply.
        final_hidden_states = self.quant_method.apply(
            layer=self,
            x=hidden_states,
            router_logits=router_logits,
            top_k=self.top_k,
            renormalize=self.renormalize,
            use_grouped_topk=self.use_grouped_topk,
            global_num_experts=self.global_num_experts,
            expert_map=self.expert_map,
            topk_group=self.topk_group,
            num_expert_group=self.num_expert_group,
            custom_routing_function=self.custom_routing_function,
            scoring_func=self.scoring_func,
            routed_scaling_factor=self.routed_scaling_factor,
            e_score_correction_bias=self.e_score_correction_bias,
            activation=self.activation,
            apply_router_weight_on_input=self.apply_router_weight_on_input,
            enable_eplb=self.enable_eplb,
            expert_load_view=self.expert_load_view,
            logical_to_physical_map=self.logical_to_physical_map,
            logical_replica_count=self.logical_replica_count,
        )

        if shared_output is not None:
            assert not isinstance(final_hidden_states, tuple)
            assert self.shared_experts is not None
            final_hidden_states = (
                shared_output,
                final_hidden_states,
            )

        def reduce_output(states: torch.Tensor,
                          do_combine: bool = True) -> torch.Tensor:
            if do_naive_dispatch_combine and do_combine:
                states = get_ep_group().combine(states)

            if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
                states = self.maybe_all_reduce_tensor_model_parallel(states)

            return states

        if self.shared_experts is None:
            assert not isinstance(final_hidden_states, tuple)
            return reduce_output(final_hidden_states)
        else:
            return (
                reduce_output(final_hidden_states[0], do_combine=False),
                reduce_output(final_hidden_states[1]),
            )

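The reduce_output helper above makes two independent decisions: whether to undo the naive EP dispatch with a combine, and whether to issue a tensor-model-parallel all-reduce. A plain-Python restatement of those conditions, with flag values chosen only as assumptions for the asserts:

    # Illustrative sketch (not from the package): the reduction decisions in
    # reduce_output(), written as standalone predicates.
    def needs_combine(do_naive_dispatch_combine: bool, do_combine: bool) -> bool:
        # A naive EP dispatch must be undone with a matching combine, except
        # for the shared-expert output, which never went through dispatch.
        return do_naive_dispatch_combine and do_combine

    def needs_all_reduce(reduce_results: bool, tp_size: int, ep_size: int) -> bool:
        # An all-reduce is only issued when the layer is configured to reduce
        # results and there is more than one rank to reduce over.
        return reduce_results and (tp_size > 1 or ep_size > 1)

    assert needs_combine(True, True) and not needs_combine(True, False)
    assert needs_all_reduce(True, tp_size=2, ep_size=1)
    assert not needs_all_reduce(True, tp_size=1, ep_size=1)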
    @classmethod
    def make_expert_params_mapping(
            cls,
            ckpt_gate_proj_name: str,
            ckpt_down_proj_name: str,
            ckpt_up_proj_name: str,
            num_experts: int,
            num_redundant_experts: int = 0) -> list[tuple[str, str, int, str]]:

        num_physical_experts = num_experts + num_redundant_experts

        # In the returned mapping:
        # - `expert_id` is the physical expert id
        # - `weight_name` contains the weight name of the logical expert
        # Thus the physical expert id must be mapped to its logical id
        # when constructing `weight_name`.
        physical_to_logical_map = \
            EplbState.build_initial_global_physical_to_logical_map(
                num_experts, num_redundant_experts)

        return [
            # (param_name, weight_name, expert_id, shard_id)
            ("experts.w13_" if weight_name
             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
             f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.",
             expert_id, shard_id) for expert_id in range(num_physical_experts)
            for shard_id, weight_name in [
                ("w1", ckpt_gate_proj_name),
                ("w2", ckpt_down_proj_name),
                ("w3", ckpt_up_proj_name),
            ]
        ]

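To get a concrete feel for the tuples this classmethod returns, the sketch below mirrors the comprehension above on an assumed identity physical-to-logical map and assumed checkpoint projection names; it does not call the real EplbState helper.

    # Illustrative sketch (not from the package): example mapping entries.
    num_experts = 2
    physical_to_logical_map = list(range(num_experts))  # assumed: no redundancy

    ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name = (
        "gate_proj", "down_proj", "up_proj")

    mapping = [
        # (param_name, weight_name, expert_id, shard_id)
        ("experts.w13_" if weight_name
         in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
         f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.",
         expert_id, shard_id) for expert_id in range(num_experts)
        for shard_id, weight_name in [
            ("w1", ckpt_gate_proj_name),
            ("w2", ckpt_down_proj_name),
            ("w3", ckpt_up_proj_name),
        ]
    ]
    # mapping[0] == ("experts.w13_", "experts.0.gate_proj.", 0, "w1")
    # mapping[1] == ("experts.w2_",  "experts.0.down_proj.", 0, "w2")
    # mapping[2] == ("experts.w13_", "experts.0.up_proj.",   0, "w3")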
    def extra_repr(self) -> str:

        s = (
            f"global_num_experts={self.global_num_experts}, "
            f"local_num_experts={self.local_num_experts}, "
            f"top_k={self.top_k}, "
            f"intermediate_size_per_partition={self.intermediate_size_per_partition}, "  # noqa: E501
            f"tp_size={self.tp_size},\n"
            f"ep_size={self.ep_size}, "
            f"reduce_results={self.reduce_results}, "
            f"renormalize={self.renormalize}, "
            f"use_grouped_topk={self.use_grouped_topk}")

        if self.use_grouped_topk:
            s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501

        s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'"  # noqa: E501

        return s


def moe_forward(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    forward_context: ForwardContext = get_forward_context()
    self = forward_context.no_compile_layers[layer_name]
    assert self.shared_experts is None
    return self.forward_impl(hidden_states, router_logits)


def moe_forward_fake(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    return torch.empty_like(hidden_states)


direct_register_custom_op(
    op_name="moe_forward",
    op_func=moe_forward,
    mutates_args=["hidden_states"],
    fake_impl=moe_forward_fake,
    dispatch_key=current_platform.dispatch_key,
    tags=(torch.Tag.needs_fixed_stride_order, ),
)


def moe_forward_shared(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    layer_name: str,
) -> tuple[torch.Tensor, torch.Tensor]:
    forward_context: ForwardContext = get_forward_context()
    self = forward_context.no_compile_layers[layer_name]
    assert self.shared_experts is not None
    return self.forward_impl(hidden_states, router_logits)


def moe_forward_shared_fake(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    layer_name: str,
) -> tuple[torch.Tensor, torch.Tensor]:
    shared_out = torch.empty_like(hidden_states)
    fused_out = torch.empty_like(hidden_states)
    return shared_out, fused_out


direct_register_custom_op(
    op_name="moe_forward_shared",
    op_func=moe_forward_shared,
    mutates_args=["hidden_states"],
    fake_impl=moe_forward_shared_fake,
    dispatch_key=current_platform.dispatch_key,
    tags=(torch.Tag.needs_fixed_stride_order, ),
)

# Mark the FusedMoE weight_loader as supporting MoE-specific parameters
# to avoid expensive runtime reflection in model-loading code.
FusedMoE.weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
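
A hedged sketch of how a model-loading loop might consult the marker set above instead of inspecting the loader's signature at runtime; the helper below and its argument names are hypothetical, not part of vLLM's loader API.

    # Illustrative sketch (not from the package): branching on the
    # supports_moe_loading attribute. Names here are hypothetical.
    def load_weight(param, loaded_weight, weight_loader, *, weight_name,
                    shard_id, expert_id):
        if getattr(weight_loader, "supports_moe_loading", False):
            # MoE-aware loaders accept the extra expert-routing arguments.
            weight_loader(param, loaded_weight, weight_name,
                          shard_id=shard_id, expert_id=expert_id)
        else:
            # Plain loaders only take the parameter and the checkpoint tensor.
            weight_loader(param, loaded_weight)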