vllm_cpu_avx512bf16-0.9.0.post2-cp310-cp310-manylinux_2_17_x86_64.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1175)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1742 -0
  4. vllm/_ipex_ops.py +243 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +15 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +44 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +33 -0
  16. vllm/assets/video.py +114 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +305 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1494 -0
  23. vllm/attention/backends/flash_attn.py +999 -0
  24. vllm/attention/backends/flashinfer.py +1100 -0
  25. vllm/attention/backends/flashmla.py +242 -0
  26. vllm/attention/backends/hpu_attn.py +309 -0
  27. vllm/attention/backends/ipex_attn.py +394 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1381 -0
  30. vllm/attention/backends/pallas.py +347 -0
  31. vllm/attention/backends/placeholder_attn.py +399 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +970 -0
  34. vllm/attention/backends/torch_sdpa.py +691 -0
  35. vllm/attention/backends/triton_mla.py +113 -0
  36. vllm/attention/backends/utils.py +609 -0
  37. vllm/attention/backends/xformers.py +798 -0
  38. vllm/attention/layer.py +452 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +245 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +367 -0
  45. vllm/attention/ops/flashmla.py +115 -0
  46. vllm/attention/ops/hpu_paged_attn.py +87 -0
  47. vllm/attention/ops/ipex_attn.py +194 -0
  48. vllm/attention/ops/merge_attn_states.py +42 -0
  49. vllm/attention/ops/nki_flash_attn.py +905 -0
  50. vllm/attention/ops/paged_attn.py +255 -0
  51. vllm/attention/ops/prefix_prefill.py +901 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +99 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  54. vllm/attention/ops/triton_decode_attention.py +673 -0
  55. vllm/attention/ops/triton_flash_attention.py +1374 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  57. vllm/attention/ops/triton_unified_attention.py +337 -0
  58. vllm/attention/selector.py +186 -0
  59. vllm/attention/utils/fa_utils.py +54 -0
  60. vllm/beam_search.py +82 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +921 -0
  63. vllm/benchmarks/endpoint_request_func.py +160 -0
  64. vllm/benchmarks/latency.py +184 -0
  65. vllm/benchmarks/serve.py +925 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +69 -0
  68. vllm/collect_env.py +818 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +88 -0
  71. vllm/compilation/backends.py +560 -0
  72. vllm/compilation/base_piecewise_backend.py +71 -0
  73. vllm/compilation/collective_fusion.py +126 -0
  74. vllm/compilation/compiler_interface.py +533 -0
  75. vllm/compilation/counter.py +33 -0
  76. vllm/compilation/cuda_piecewise_backend.py +213 -0
  77. vllm/compilation/decorators.py +249 -0
  78. vllm/compilation/fix_functionalization.py +190 -0
  79. vllm/compilation/fusion.py +617 -0
  80. vllm/compilation/fx_utils.py +61 -0
  81. vllm/compilation/inductor_pass.py +114 -0
  82. vllm/compilation/monitor.py +38 -0
  83. vllm/compilation/multi_output_match.py +108 -0
  84. vllm/compilation/noop_elimination.py +136 -0
  85. vllm/compilation/pass_manager.py +77 -0
  86. vllm/compilation/sequence_parallelism.py +267 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  88. vllm/compilation/vllm_inductor_pass.py +66 -0
  89. vllm/compilation/wrapper.py +129 -0
  90. vllm/config.py +4600 -0
  91. vllm/connections.py +173 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +398 -0
  95. vllm/core/block/common.py +370 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  97. vllm/core/block/interfaces.py +318 -0
  98. vllm/core/block/naive_block.py +465 -0
  99. vllm/core/block/prefix_caching_block.py +1134 -0
  100. vllm/core/block/utils.py +27 -0
  101. vllm/core/block_manager.py +520 -0
  102. vllm/core/evictor.py +156 -0
  103. vllm/core/interfaces.py +134 -0
  104. vllm/core/placeholder_block_space_manager.py +99 -0
  105. vllm/core/scheduler.py +2092 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +280 -0
  108. vllm/distributed/__init__.py +5 -0
  109. vllm/distributed/communication_op.py +40 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +126 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +144 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +167 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +303 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +258 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  120. vllm/distributed/device_communicators/pynccl.py +217 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +541 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  125. vllm/distributed/kv_events.py +296 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +11 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +126 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +202 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +91 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +5 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +259 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +189 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +851 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  152. vllm/distributed/parallel_state.py +1294 -0
  153. vllm/distributed/utils.py +520 -0
  154. vllm/engine/__init__.py +0 -0
  155. vllm/engine/arg_utils.py +1649 -0
  156. vllm/engine/async_llm_engine.py +1274 -0
  157. vllm/engine/async_timeout.py +191 -0
  158. vllm/engine/llm_engine.py +2153 -0
  159. vllm/engine/metrics.py +717 -0
  160. vllm/engine/metrics_types.py +96 -0
  161. vllm/engine/multiprocessing/__init__.py +188 -0
  162. vllm/engine/multiprocessing/client.py +755 -0
  163. vllm/engine/multiprocessing/engine.py +459 -0
  164. vllm/engine/output_processor/__init__.py +0 -0
  165. vllm/engine/output_processor/interfaces.py +74 -0
  166. vllm/engine/output_processor/multi_step.py +215 -0
  167. vllm/engine/output_processor/single_step.py +144 -0
  168. vllm/engine/output_processor/stop_checker.py +130 -0
  169. vllm/engine/output_processor/util.py +27 -0
  170. vllm/engine/protocol.py +310 -0
  171. vllm/entrypoints/__init__.py +0 -0
  172. vllm/entrypoints/api_server.py +177 -0
  173. vllm/entrypoints/chat_utils.py +1298 -0
  174. vllm/entrypoints/cli/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/base.py +38 -0
  177. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  178. vllm/entrypoints/cli/benchmark/main.py +53 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  180. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  181. vllm/entrypoints/cli/collect_env.py +34 -0
  182. vllm/entrypoints/cli/main.py +62 -0
  183. vllm/entrypoints/cli/openai.py +204 -0
  184. vllm/entrypoints/cli/serve.py +141 -0
  185. vllm/entrypoints/cli/types.py +24 -0
  186. vllm/entrypoints/launcher.py +146 -0
  187. vllm/entrypoints/llm.py +1503 -0
  188. vllm/entrypoints/logger.py +49 -0
  189. vllm/entrypoints/openai/__init__.py +0 -0
  190. vllm/entrypoints/openai/api_server.py +1376 -0
  191. vllm/entrypoints/openai/cli_args.py +306 -0
  192. vllm/entrypoints/openai/logits_processors.py +89 -0
  193. vllm/entrypoints/openai/protocol.py +1890 -0
  194. vllm/entrypoints/openai/run_batch.py +439 -0
  195. vllm/entrypoints/openai/serving_chat.py +1192 -0
  196. vllm/entrypoints/openai/serving_classification.py +159 -0
  197. vllm/entrypoints/openai/serving_completion.py +590 -0
  198. vllm/entrypoints/openai/serving_embedding.py +200 -0
  199. vllm/entrypoints/openai/serving_engine.py +985 -0
  200. vllm/entrypoints/openai/serving_models.py +314 -0
  201. vllm/entrypoints/openai/serving_pooling.py +231 -0
  202. vllm/entrypoints/openai/serving_score.py +432 -0
  203. vllm/entrypoints/openai/serving_tokenization.py +151 -0
  204. vllm/entrypoints/openai/serving_transcription.py +421 -0
  205. vllm/entrypoints/openai/tool_parsers/__init__.py +22 -0
  206. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  207. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +369 -0
  208. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +258 -0
  209. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +236 -0
  210. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  211. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +215 -0
  212. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +307 -0
  213. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +302 -0
  214. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +266 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  216. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +111 -0
  217. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +296 -0
  218. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  219. vllm/entrypoints/score_utils.py +49 -0
  220. vllm/entrypoints/ssl.py +74 -0
  221. vllm/entrypoints/utils.py +219 -0
  222. vllm/env_override.py +34 -0
  223. vllm/envs.py +896 -0
  224. vllm/executor/__init__.py +0 -0
  225. vllm/executor/executor_base.py +400 -0
  226. vllm/executor/mp_distributed_executor.py +243 -0
  227. vllm/executor/msgspec_utils.py +29 -0
  228. vllm/executor/multiproc_worker_utils.py +312 -0
  229. vllm/executor/ray_distributed_executor.py +700 -0
  230. vllm/executor/ray_utils.py +398 -0
  231. vllm/executor/uniproc_executor.py +138 -0
  232. vllm/forward_context.py +147 -0
  233. vllm/inputs/__init__.py +40 -0
  234. vllm/inputs/data.py +330 -0
  235. vllm/inputs/parse.py +150 -0
  236. vllm/inputs/preprocess.py +908 -0
  237. vllm/inputs/registry.py +214 -0
  238. vllm/jsontree.py +79 -0
  239. vllm/logger.py +211 -0
  240. vllm/logging_utils/__init__.py +7 -0
  241. vllm/logging_utils/dump_input.py +84 -0
  242. vllm/logging_utils/formatter.py +17 -0
  243. vllm/logits_process.py +118 -0
  244. vllm/lora/__init__.py +0 -0
  245. vllm/lora/fully_sharded_layers.py +354 -0
  246. vllm/lora/layers.py +1284 -0
  247. vllm/lora/lora.py +198 -0
  248. vllm/lora/models.py +817 -0
  249. vllm/lora/ops/__init__.py +0 -0
  250. vllm/lora/ops/torch_ops/__init__.py +15 -0
  251. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  252. vllm/lora/ops/triton_ops/__init__.py +11 -0
  253. vllm/lora/ops/triton_ops/kernel_utils.py +242 -0
  254. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  255. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  256. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  257. vllm/lora/ops/triton_ops/utils.py +119 -0
  258. vllm/lora/ops/xla_ops/__init__.py +6 -0
  259. vllm/lora/ops/xla_ops/lora_ops.py +106 -0
  260. vllm/lora/ops/xla_ops/pallas.py +133 -0
  261. vllm/lora/peft_helper.py +135 -0
  262. vllm/lora/punica_wrapper/__init__.py +9 -0
  263. vllm/lora/punica_wrapper/punica_base.py +484 -0
  264. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  265. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  266. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  267. vllm/lora/punica_wrapper/punica_selector.py +19 -0
  268. vllm/lora/punica_wrapper/punica_tpu.py +325 -0
  269. vllm/lora/punica_wrapper/utils.py +163 -0
  270. vllm/lora/request.py +98 -0
  271. vllm/lora/resolver.py +84 -0
  272. vllm/lora/utils.py +239 -0
  273. vllm/lora/worker_manager.py +253 -0
  274. vllm/model_executor/__init__.py +15 -0
  275. vllm/model_executor/custom_op.py +151 -0
  276. vllm/model_executor/guided_decoding/__init__.py +180 -0
  277. vllm/model_executor/guided_decoding/guidance_decoding.py +62 -0
  278. vllm/model_executor/guided_decoding/guidance_logits_processors.py +103 -0
  279. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  280. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  281. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  282. vllm/model_executor/guided_decoding/outlines_logits_processors.py +283 -0
  283. vllm/model_executor/guided_decoding/utils.py +241 -0
  284. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  285. vllm/model_executor/layers/__init__.py +0 -0
  286. vllm/model_executor/layers/activation.py +368 -0
  287. vllm/model_executor/layers/fused_moe/__init__.py +53 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  449. vllm/model_executor/layers/fused_moe/cutlass_moe.py +382 -0
  450. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +227 -0
  451. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +755 -0
  452. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +231 -0
  453. vllm/model_executor/layers/fused_moe/fused_moe.py +1722 -0
  454. vllm/model_executor/layers/fused_moe/layer.py +1366 -0
  455. vllm/model_executor/layers/fused_moe/modular_kernel.py +364 -0
  456. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +242 -0
  457. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  458. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +188 -0
  459. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  460. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +146 -0
  461. vllm/model_executor/layers/fused_moe/prepare_finalize.py +60 -0
  462. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +372 -0
  463. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +112 -0
  464. vllm/model_executor/layers/fused_moe/utils.py +97 -0
  465. vllm/model_executor/layers/layernorm.py +287 -0
  466. vllm/model_executor/layers/lightning_attn.py +651 -0
  467. vllm/model_executor/layers/linear.py +1523 -0
  468. vllm/model_executor/layers/logits_processor.py +196 -0
  469. vllm/model_executor/layers/mamba/__init__.py +0 -0
  470. vllm/model_executor/layers/mamba/mamba2_metadata.py +124 -0
  471. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  472. vllm/model_executor/layers/mamba/mamba_mixer2.py +615 -0
  473. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  474. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  475. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +413 -0
  476. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  477. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  478. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  479. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  480. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  481. vllm/model_executor/layers/pooler.py +343 -0
  482. vllm/model_executor/layers/quantization/__init__.py +156 -0
  483. vllm/model_executor/layers/quantization/aqlm.py +375 -0
  484. vllm/model_executor/layers/quantization/auto_round.py +308 -0
  485. vllm/model_executor/layers/quantization/awq.py +185 -0
  486. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  487. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  488. vllm/model_executor/layers/quantization/base_config.py +150 -0
  489. vllm/model_executor/layers/quantization/bitblas.py +460 -0
  490. vllm/model_executor/layers/quantization/bitsandbytes.py +397 -0
  491. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  492. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +644 -0
  493. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1252 -0
  494. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +21 -0
  495. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  496. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  497. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  498. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +92 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +120 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +214 -0
  505. vllm/model_executor/layers/quantization/deepspeedfp.py +194 -0
  506. vllm/model_executor/layers/quantization/experts_int8.py +195 -0
  507. vllm/model_executor/layers/quantization/fbgemm_fp8.py +171 -0
  508. vllm/model_executor/layers/quantization/fp8.py +876 -0
  509. vllm/model_executor/layers/quantization/gguf.py +564 -0
  510. vllm/model_executor/layers/quantization/gptq.py +277 -0
  511. vllm/model_executor/layers/quantization/gptq_bitblas.py +444 -0
  512. vllm/model_executor/layers/quantization/gptq_marlin.py +647 -0
  513. vllm/model_executor/layers/quantization/gptq_marlin_24.py +296 -0
  514. vllm/model_executor/layers/quantization/hqq_marlin.py +331 -0
  515. vllm/model_executor/layers/quantization/ipex_quant.py +249 -0
  516. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  517. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  518. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  519. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  520. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  521. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  522. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  523. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +130 -0
  524. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  525. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  526. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  527. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  528. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  529. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  530. vllm/model_executor/layers/quantization/kv_cache.py +138 -0
  531. vllm/model_executor/layers/quantization/marlin.py +260 -0
  532. vllm/model_executor/layers/quantization/modelopt.py +734 -0
  533. vllm/model_executor/layers/quantization/moe_wna16.py +448 -0
  534. vllm/model_executor/layers/quantization/neuron_quant.py +68 -0
  535. vllm/model_executor/layers/quantization/ptpc_fp8.py +126 -0
  536. vllm/model_executor/layers/quantization/qqq.py +274 -0
  537. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  538. vllm/model_executor/layers/quantization/quark/quark.py +440 -0
  539. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  540. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +8 -0
  541. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  542. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +125 -0
  543. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +145 -0
  544. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  545. vllm/model_executor/layers/quantization/quark/utils.py +104 -0
  546. vllm/model_executor/layers/quantization/schema.py +85 -0
  547. vllm/model_executor/layers/quantization/torchao.py +143 -0
  548. vllm/model_executor/layers/quantization/tpu_int8.py +120 -0
  549. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  550. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  551. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +207 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  754. vllm/model_executor/layers/quantization/utils/fp8_utils.py +611 -0
  755. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  756. vllm/model_executor/layers/quantization/utils/int8_utils.py +484 -0
  757. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  758. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  759. vllm/model_executor/layers/quantization/utils/marlin_utils.py +475 -0
  760. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +277 -0
  761. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +324 -0
  762. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  763. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +463 -0
  764. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +125 -0
  765. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +44 -0
  766. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +61 -0
  767. vllm/model_executor/layers/quantization/utils/quant_utils.py +572 -0
  768. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  769. vllm/model_executor/layers/rejection_sampler.py +405 -0
  770. vllm/model_executor/layers/resampler.py +269 -0
  771. vllm/model_executor/layers/rotary_embedding.py +1861 -0
  772. vllm/model_executor/layers/sampler.py +1203 -0
  773. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  774. vllm/model_executor/layers/typical_acceptance_sampler.py +165 -0
  775. vllm/model_executor/layers/utils.py +99 -0
  776. vllm/model_executor/layers/vocab_parallel_embedding.py +486 -0
  777. vllm/model_executor/model_loader/__init__.py +75 -0
  778. vllm/model_executor/model_loader/base_loader.py +24 -0
  779. vllm/model_executor/model_loader/bitsandbytes_loader.py +582 -0
  780. vllm/model_executor/model_loader/default_loader.py +295 -0
  781. vllm/model_executor/model_loader/dummy_loader.py +37 -0
  782. vllm/model_executor/model_loader/gguf_loader.py +113 -0
  783. vllm/model_executor/model_loader/neuron.py +475 -0
  784. vllm/model_executor/model_loader/neuronx_distributed.py +622 -0
  785. vllm/model_executor/model_loader/runai_streamer_loader.py +120 -0
  786. vllm/model_executor/model_loader/sharded_state_loader.py +211 -0
  787. vllm/model_executor/model_loader/tensorizer.py +632 -0
  788. vllm/model_executor/model_loader/tensorizer_loader.py +122 -0
  789. vllm/model_executor/model_loader/utils.py +301 -0
  790. vllm/model_executor/model_loader/weight_utils.py +781 -0
  791. vllm/model_executor/models/__init__.py +27 -0
  792. vllm/model_executor/models/adapters.py +247 -0
  793. vllm/model_executor/models/aimv2.py +199 -0
  794. vllm/model_executor/models/arctic.py +558 -0
  795. vllm/model_executor/models/aria.py +656 -0
  796. vllm/model_executor/models/aya_vision.py +461 -0
  797. vllm/model_executor/models/baichuan.py +473 -0
  798. vllm/model_executor/models/bamba.py +542 -0
  799. vllm/model_executor/models/bart.py +937 -0
  800. vllm/model_executor/models/bert.py +517 -0
  801. vllm/model_executor/models/bert_with_rope.py +714 -0
  802. vllm/model_executor/models/blip.py +338 -0
  803. vllm/model_executor/models/blip2.py +717 -0
  804. vllm/model_executor/models/bloom.py +372 -0
  805. vllm/model_executor/models/chameleon.py +1135 -0
  806. vllm/model_executor/models/chatglm.py +477 -0
  807. vllm/model_executor/models/clip.py +411 -0
  808. vllm/model_executor/models/commandr.py +471 -0
  809. vllm/model_executor/models/constant_size_cache.py +136 -0
  810. vllm/model_executor/models/dbrx.py +471 -0
  811. vllm/model_executor/models/deepseek.py +485 -0
  812. vllm/model_executor/models/deepseek_mtp.py +268 -0
  813. vllm/model_executor/models/deepseek_v2.py +842 -0
  814. vllm/model_executor/models/deepseek_vl2.py +647 -0
  815. vllm/model_executor/models/eagle.py +259 -0
  816. vllm/model_executor/models/exaone.py +550 -0
  817. vllm/model_executor/models/fairseq2_llama.py +153 -0
  818. vllm/model_executor/models/falcon.py +509 -0
  819. vllm/model_executor/models/falcon_h1.py +684 -0
  820. vllm/model_executor/models/florence2.py +1102 -0
  821. vllm/model_executor/models/fuyu.py +388 -0
  822. vllm/model_executor/models/gemma.py +424 -0
  823. vllm/model_executor/models/gemma2.py +424 -0
  824. vllm/model_executor/models/gemma3.py +532 -0
  825. vllm/model_executor/models/gemma3_mm.py +708 -0
  826. vllm/model_executor/models/glm.py +22 -0
  827. vllm/model_executor/models/glm4.py +304 -0
  828. vllm/model_executor/models/glm4v.py +647 -0
  829. vllm/model_executor/models/gpt2.py +327 -0
  830. vllm/model_executor/models/gpt_bigcode.py +334 -0
  831. vllm/model_executor/models/gpt_j.py +338 -0
  832. vllm/model_executor/models/gpt_neox.py +331 -0
  833. vllm/model_executor/models/granite.py +492 -0
  834. vllm/model_executor/models/granite_speech.py +778 -0
  835. vllm/model_executor/models/granitemoe.py +436 -0
  836. vllm/model_executor/models/granitemoehybrid.py +585 -0
  837. vllm/model_executor/models/granitemoeshared.py +340 -0
  838. vllm/model_executor/models/gritlm.py +223 -0
  839. vllm/model_executor/models/grok1.py +545 -0
  840. vllm/model_executor/models/h2ovl.py +545 -0
  841. vllm/model_executor/models/idefics2_vision_model.py +388 -0
  842. vllm/model_executor/models/idefics3.py +767 -0
  843. vllm/model_executor/models/interfaces.py +571 -0
  844. vllm/model_executor/models/interfaces_base.py +163 -0
  845. vllm/model_executor/models/intern_vit.py +475 -0
  846. vllm/model_executor/models/internlm2.py +454 -0
  847. vllm/model_executor/models/internlm2_ve.py +146 -0
  848. vllm/model_executor/models/internvl.py +1405 -0
  849. vllm/model_executor/models/jais.py +372 -0
  850. vllm/model_executor/models/jamba.py +591 -0
  851. vllm/model_executor/models/kimi_vl.py +576 -0
  852. vllm/model_executor/models/llama.py +643 -0
  853. vllm/model_executor/models/llama4.py +531 -0
  854. vllm/model_executor/models/llama_eagle.py +166 -0
  855. vllm/model_executor/models/llama_eagle3.py +257 -0
  856. vllm/model_executor/models/llava.py +865 -0
  857. vllm/model_executor/models/llava_next.py +585 -0
  858. vllm/model_executor/models/llava_next_video.py +470 -0
  859. vllm/model_executor/models/llava_onevision.py +955 -0
  860. vllm/model_executor/models/mamba.py +272 -0
  861. vllm/model_executor/models/mamba2.py +302 -0
  862. vllm/model_executor/models/mamba_cache.py +75 -0
  863. vllm/model_executor/models/medusa.py +218 -0
  864. vllm/model_executor/models/mimo.py +191 -0
  865. vllm/model_executor/models/mimo_mtp.py +284 -0
  866. vllm/model_executor/models/minicpm.py +590 -0
  867. vllm/model_executor/models/minicpm3.py +229 -0
  868. vllm/model_executor/models/minicpmo.py +758 -0
  869. vllm/model_executor/models/minicpmv.py +1286 -0
  870. vllm/model_executor/models/minimax_cache.py +35 -0
  871. vllm/model_executor/models/minimax_text_01.py +1303 -0
  872. vllm/model_executor/models/minimax_vl_01.py +363 -0
  873. vllm/model_executor/models/mistral3.py +603 -0
  874. vllm/model_executor/models/mixtral.py +487 -0
  875. vllm/model_executor/models/mixtral_quant.py +452 -0
  876. vllm/model_executor/models/mllama.py +1623 -0
  877. vllm/model_executor/models/mllama4.py +838 -0
  878. vllm/model_executor/models/mlp_speculator.py +205 -0
  879. vllm/model_executor/models/modernbert.py +329 -0
  880. vllm/model_executor/models/module_mapping.py +71 -0
  881. vllm/model_executor/models/molmo.py +1567 -0
  882. vllm/model_executor/models/moonvit.py +629 -0
  883. vllm/model_executor/models/mpt.py +330 -0
  884. vllm/model_executor/models/nemotron.py +507 -0
  885. vllm/model_executor/models/nemotron_nas.py +483 -0
  886. vllm/model_executor/models/nvlm_d.py +215 -0
  887. vllm/model_executor/models/olmo.py +388 -0
  888. vllm/model_executor/models/olmo2.py +413 -0
  889. vllm/model_executor/models/olmoe.py +446 -0
  890. vllm/model_executor/models/opt.py +411 -0
  891. vllm/model_executor/models/orion.py +348 -0
  892. vllm/model_executor/models/ovis.py +554 -0
  893. vllm/model_executor/models/paligemma.py +397 -0
  894. vllm/model_executor/models/persimmon.py +343 -0
  895. vllm/model_executor/models/phi.py +355 -0
  896. vllm/model_executor/models/phi3.py +18 -0
  897. vllm/model_executor/models/phi3_small.py +464 -0
  898. vllm/model_executor/models/phi3v.py +722 -0
  899. vllm/model_executor/models/phi4mm.py +1245 -0
  900. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  901. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  902. vllm/model_executor/models/phimoe.py +664 -0
  903. vllm/model_executor/models/pixtral.py +1315 -0
  904. vllm/model_executor/models/plamo2.py +737 -0
  905. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  906. vllm/model_executor/models/qwen.py +361 -0
  907. vllm/model_executor/models/qwen2.py +567 -0
  908. vllm/model_executor/models/qwen2_5_omni_thinker.py +903 -0
  909. vllm/model_executor/models/qwen2_5_vl.py +1171 -0
  910. vllm/model_executor/models/qwen2_audio.py +409 -0
  911. vllm/model_executor/models/qwen2_moe.py +539 -0
  912. vllm/model_executor/models/qwen2_rm.py +131 -0
  913. vllm/model_executor/models/qwen2_vl.py +1410 -0
  914. vllm/model_executor/models/qwen3.py +320 -0
  915. vllm/model_executor/models/qwen3_moe.py +534 -0
  916. vllm/model_executor/models/qwen_vl.py +784 -0
  917. vllm/model_executor/models/registry.py +618 -0
  918. vllm/model_executor/models/roberta.py +273 -0
  919. vllm/model_executor/models/siglip.py +523 -0
  920. vllm/model_executor/models/skyworkr1v.py +950 -0
  921. vllm/model_executor/models/smolvlm.py +51 -0
  922. vllm/model_executor/models/solar.py +505 -0
  923. vllm/model_executor/models/stablelm.py +342 -0
  924. vllm/model_executor/models/starcoder2.py +355 -0
  925. vllm/model_executor/models/telechat2.py +139 -0
  926. vllm/model_executor/models/teleflm.py +78 -0
  927. vllm/model_executor/models/transformers.py +507 -0
  928. vllm/model_executor/models/ultravox.py +655 -0
  929. vllm/model_executor/models/utils.py +730 -0
  930. vllm/model_executor/models/vision.py +146 -0
  931. vllm/model_executor/models/whisper.py +746 -0
  932. vllm/model_executor/models/zamba2.py +1008 -0
  933. vllm/model_executor/parameter.py +458 -0
  934. vllm/model_executor/pooling_metadata.py +71 -0
  935. vllm/model_executor/sampling_metadata.py +596 -0
  936. vllm/model_executor/utils.py +53 -0
  937. vllm/multimodal/__init__.py +32 -0
  938. vllm/multimodal/audio.py +105 -0
  939. vllm/multimodal/base.py +218 -0
  940. vllm/multimodal/hasher.py +117 -0
  941. vllm/multimodal/image.py +96 -0
  942. vllm/multimodal/inputs.py +872 -0
  943. vllm/multimodal/parse.py +460 -0
  944. vllm/multimodal/processing.py +1894 -0
  945. vllm/multimodal/profiling.py +273 -0
  946. vllm/multimodal/registry.py +330 -0
  947. vllm/multimodal/utils.py +392 -0
  948. vllm/multimodal/video.py +197 -0
  949. vllm/outputs.py +525 -0
  950. vllm/platforms/__init__.py +290 -0
  951. vllm/platforms/cpu.py +205 -0
  952. vllm/platforms/cuda.py +461 -0
  953. vllm/platforms/hpu.py +105 -0
  954. vllm/platforms/interface.py +492 -0
  955. vllm/platforms/neuron.py +152 -0
  956. vllm/platforms/rocm.py +388 -0
  957. vllm/platforms/tpu.py +215 -0
  958. vllm/platforms/xpu.py +155 -0
  959. vllm/plugins/__init__.py +86 -0
  960. vllm/plugins/lora_resolvers/README.md +15 -0
  961. vllm/plugins/lora_resolvers/__init__.py +0 -0
  962. vllm/plugins/lora_resolvers/filesystem_resolver.py +49 -0
  963. vllm/pooling_params.py +53 -0
  964. vllm/profiler/__init__.py +0 -0
  965. vllm/profiler/layerwise_profile.py +374 -0
  966. vllm/profiler/utils.py +147 -0
  967. vllm/prompt_adapter/__init__.py +0 -0
  968. vllm/prompt_adapter/layers.py +82 -0
  969. vllm/prompt_adapter/models.py +357 -0
  970. vllm/prompt_adapter/request.py +36 -0
  971. vllm/prompt_adapter/utils.py +97 -0
  972. vllm/prompt_adapter/worker_manager.py +178 -0
  973. vllm/py.typed +2 -0
  974. vllm/reasoning/__init__.py +14 -0
  975. vllm/reasoning/abs_reasoning_parsers.py +191 -0
  976. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  977. vllm/reasoning/granite_reasoning_parser.py +362 -0
  978. vllm/reasoning/qwen3_reasoning_parser.py +150 -0
  979. vllm/sampling_params.py +590 -0
  980. vllm/scalar_type.py +346 -0
  981. vllm/scripts.py +14 -0
  982. vllm/sequence.py +1567 -0
  983. vllm/spec_decode/__init__.py +0 -0
  984. vllm/spec_decode/batch_expansion.py +505 -0
  985. vllm/spec_decode/draft_model_runner.py +349 -0
  986. vllm/spec_decode/interfaces.py +98 -0
  987. vllm/spec_decode/medusa_worker.py +137 -0
  988. vllm/spec_decode/metrics.py +212 -0
  989. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  990. vllm/spec_decode/mqa_scorer.py +159 -0
  991. vllm/spec_decode/multi_step_worker.py +422 -0
  992. vllm/spec_decode/ngram_worker.py +195 -0
  993. vllm/spec_decode/proposer_worker_base.py +58 -0
  994. vllm/spec_decode/smaller_tp_proposer_worker.py +195 -0
  995. vllm/spec_decode/spec_decode_worker.py +1325 -0
  996. vllm/spec_decode/target_model_runner.py +44 -0
  997. vllm/spec_decode/top1_proposer.py +274 -0
  998. vllm/spec_decode/util.py +276 -0
  999. vllm/test_utils.py +129 -0
  1000. vllm/third_party/__init__.py +0 -0
  1001. vllm/third_party/pynvml.py +6139 -0
  1002. vllm/tracing.py +130 -0
  1003. vllm/transformers_utils/__init__.py +23 -0
  1004. vllm/transformers_utils/chat_templates/__init__.py +4 -0
  1005. vllm/transformers_utils/chat_templates/registry.py +59 -0
  1006. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1007. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1008. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1009. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1010. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1011. vllm/transformers_utils/config.py +835 -0
  1012. vllm/transformers_utils/configs/__init__.py +58 -0
  1013. vllm/transformers_utils/configs/arctic.py +206 -0
  1014. vllm/transformers_utils/configs/chatglm.py +71 -0
  1015. vllm/transformers_utils/configs/cohere2.py +194 -0
  1016. vllm/transformers_utils/configs/dbrx.py +279 -0
  1017. vllm/transformers_utils/configs/deepseek_vl2.py +215 -0
  1018. vllm/transformers_utils/configs/eagle.py +84 -0
  1019. vllm/transformers_utils/configs/exaone.py +189 -0
  1020. vllm/transformers_utils/configs/falcon.py +89 -0
  1021. vllm/transformers_utils/configs/h2ovl.py +15 -0
  1022. vllm/transformers_utils/configs/internvl.py +53 -0
  1023. vllm/transformers_utils/configs/jais.py +237 -0
  1024. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  1025. vllm/transformers_utils/configs/medusa.py +62 -0
  1026. vllm/transformers_utils/configs/minimax_text_01.py +69 -0
  1027. vllm/transformers_utils/configs/minimax_vl_01.py +70 -0
  1028. vllm/transformers_utils/configs/mllama.py +30 -0
  1029. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  1030. vllm/transformers_utils/configs/moonvit.py +32 -0
  1031. vllm/transformers_utils/configs/mpt.py +179 -0
  1032. vllm/transformers_utils/configs/nemotron.py +204 -0
  1033. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  1034. vllm/transformers_utils/configs/ovis.py +183 -0
  1035. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  1036. vllm/transformers_utils/configs/solar.py +246 -0
  1037. vllm/transformers_utils/configs/telechat2.py +63 -0
  1038. vllm/transformers_utils/configs/ultravox.py +107 -0
  1039. vllm/transformers_utils/detokenizer.py +167 -0
  1040. vllm/transformers_utils/detokenizer_utils.py +188 -0
  1041. vllm/transformers_utils/processor.py +220 -0
  1042. vllm/transformers_utils/processors/__init__.py +7 -0
  1043. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1044. vllm/transformers_utils/processors/ovis.py +419 -0
  1045. vllm/transformers_utils/s3_utils.py +161 -0
  1046. vllm/transformers_utils/tokenizer.py +301 -0
  1047. vllm/transformers_utils/tokenizer_base.py +148 -0
  1048. vllm/transformers_utils/tokenizer_group.py +119 -0
  1049. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  1050. vllm/transformers_utils/tokenizers/mistral.py +490 -0
  1051. vllm/transformers_utils/utils.py +98 -0
  1052. vllm/triton_utils/__init__.py +13 -0
  1053. vllm/triton_utils/importing.py +49 -0
  1054. vllm/usage/__init__.py +0 -0
  1055. vllm/usage/usage_lib.py +255 -0
  1056. vllm/utils.py +2844 -0
  1057. vllm/v1/__init__.py +0 -0
  1058. vllm/v1/attention/__init__.py +0 -0
  1059. vllm/v1/attention/backends/__init__.py +0 -0
  1060. vllm/v1/attention/backends/flash_attn.py +833 -0
  1061. vllm/v1/attention/backends/flashinfer.py +639 -0
  1062. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1063. vllm/v1/attention/backends/mla/common.py +926 -0
  1064. vllm/v1/attention/backends/mla/flashmla.py +150 -0
  1065. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +221 -0
  1066. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1067. vllm/v1/attention/backends/pallas.py +235 -0
  1068. vllm/v1/attention/backends/triton_attn.py +279 -0
  1069. vllm/v1/attention/backends/utils.py +18 -0
  1070. vllm/v1/core/__init__.py +0 -0
  1071. vllm/v1/core/block_pool.py +328 -0
  1072. vllm/v1/core/encoder_cache_manager.py +149 -0
  1073. vllm/v1/core/kv_cache_manager.py +372 -0
  1074. vllm/v1/core/kv_cache_utils.py +748 -0
  1075. vllm/v1/core/sched/__init__.py +0 -0
  1076. vllm/v1/core/sched/interface.py +143 -0
  1077. vllm/v1/core/sched/output.py +153 -0
  1078. vllm/v1/core/sched/scheduler.py +1015 -0
  1079. vllm/v1/core/sched/utils.py +22 -0
  1080. vllm/v1/core/single_type_kv_cache_manager.py +358 -0
  1081. vllm/v1/engine/__init__.py +171 -0
  1082. vllm/v1/engine/async_llm.py +546 -0
  1083. vllm/v1/engine/core.py +801 -0
  1084. vllm/v1/engine/core_client.py +1020 -0
  1085. vllm/v1/engine/detokenizer.py +260 -0
  1086. vllm/v1/engine/exceptions.py +16 -0
  1087. vllm/v1/engine/llm_engine.py +316 -0
  1088. vllm/v1/engine/logprobs.py +198 -0
  1089. vllm/v1/engine/mm_input_cache.py +90 -0
  1090. vllm/v1/engine/output_processor.py +427 -0
  1091. vllm/v1/engine/parallel_sampling.py +132 -0
  1092. vllm/v1/engine/processor.py +398 -0
  1093. vllm/v1/executor/__init__.py +0 -0
  1094. vllm/v1/executor/abstract.py +112 -0
  1095. vllm/v1/executor/multiproc_executor.py +532 -0
  1096. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1097. vllm/v1/kv_cache_interface.py +208 -0
  1098. vllm/v1/metrics/__init__.py +0 -0
  1099. vllm/v1/metrics/loggers.py +511 -0
  1100. vllm/v1/metrics/ray_wrappers.py +120 -0
  1101. vllm/v1/metrics/reader.py +245 -0
  1102. vllm/v1/metrics/stats.py +238 -0
  1103. vllm/v1/outputs.py +115 -0
  1104. vllm/v1/request.py +191 -0
  1105. vllm/v1/sample/__init__.py +0 -0
  1106. vllm/v1/sample/metadata.py +43 -0
  1107. vllm/v1/sample/ops/__init__.py +0 -0
  1108. vllm/v1/sample/ops/bad_words.py +38 -0
  1109. vllm/v1/sample/ops/penalties.py +58 -0
  1110. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1111. vllm/v1/sample/rejection_sampler.py +630 -0
  1112. vllm/v1/sample/sampler.py +270 -0
  1113. vllm/v1/sample/tpu/__init__.py +0 -0
  1114. vllm/v1/sample/tpu/metadata.py +123 -0
  1115. vllm/v1/sample/tpu/sampler.py +144 -0
  1116. vllm/v1/serial_utils.py +313 -0
  1117. vllm/v1/spec_decode/__init__.py +0 -0
  1118. vllm/v1/spec_decode/eagle.py +424 -0
  1119. vllm/v1/spec_decode/medusa.py +61 -0
  1120. vllm/v1/spec_decode/metadata.py +61 -0
  1121. vllm/v1/spec_decode/metrics.py +177 -0
  1122. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1123. vllm/v1/spec_decode/utils.py +45 -0
  1124. vllm/v1/structured_output/__init__.py +215 -0
  1125. vllm/v1/structured_output/backend_guidance.py +244 -0
  1126. vllm/v1/structured_output/backend_types.py +133 -0
  1127. vllm/v1/structured_output/backend_xgrammar.py +317 -0
  1128. vllm/v1/structured_output/request.py +85 -0
  1129. vllm/v1/structured_output/utils.py +174 -0
  1130. vllm/v1/utils.py +294 -0
  1131. vllm/v1/worker/__init__.py +0 -0
  1132. vllm/v1/worker/block_table.py +139 -0
  1133. vllm/v1/worker/gpu_input_batch.py +680 -0
  1134. vllm/v1/worker/gpu_model_runner.py +2084 -0
  1135. vllm/v1/worker/gpu_worker.py +373 -0
  1136. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1137. vllm/v1/worker/tpu_model_runner.py +1510 -0
  1138. vllm/v1/worker/tpu_worker.py +276 -0
  1139. vllm/v1/worker/utils.py +74 -0
  1140. vllm/v1/worker/worker_base.py +64 -0
  1141. vllm/version.py +40 -0
  1142. vllm/vllm_flash_attn/.gitkeep +0 -0
  1143. vllm/worker/__init__.py +0 -0
  1144. vllm/worker/cache_engine.py +144 -0
  1145. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1146. vllm/worker/cpu_model_runner.py +671 -0
  1147. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1148. vllm/worker/cpu_worker.py +400 -0
  1149. vllm/worker/enc_dec_model_runner.py +555 -0
  1150. vllm/worker/hpu_model_runner.py +2319 -0
  1151. vllm/worker/hpu_worker.py +483 -0
  1152. vllm/worker/model_runner.py +2178 -0
  1153. vllm/worker/model_runner_base.py +281 -0
  1154. vllm/worker/multi_step_hpu_worker.py +122 -0
  1155. vllm/worker/multi_step_model_runner.py +910 -0
  1156. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1157. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1158. vllm/worker/multi_step_tpu_worker.py +107 -0
  1159. vllm/worker/multi_step_worker.py +196 -0
  1160. vllm/worker/neuron_model_runner.py +418 -0
  1161. vllm/worker/neuron_worker.py +158 -0
  1162. vllm/worker/neuronx_distributed_model_runner.py +136 -0
  1163. vllm/worker/pooling_model_runner.py +211 -0
  1164. vllm/worker/tpu_model_runner.py +908 -0
  1165. vllm/worker/tpu_worker.py +336 -0
  1166. vllm/worker/utils.py +52 -0
  1167. vllm/worker/worker.py +574 -0
  1168. vllm/worker/worker_base.py +644 -0
  1169. vllm/worker/xpu_model_runner.py +606 -0
  1170. vllm/worker/xpu_worker.py +185 -0
  1171. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/METADATA +335 -0
  1172. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/RECORD +1175 -0
  1173. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/WHEEL +5 -0
  1174. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/entry_points.txt +5 -0
  1175. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1366 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ import importlib
+ from abc import abstractmethod
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Callable, Optional
+
+ import torch
+ import torch.nn.functional as F
+ from torch.nn.parameter import UninitializedParameter
+
+ import vllm.envs as envs
+ from vllm.config import ParallelConfig, get_current_vllm_config
+ from vllm.distributed import (get_dp_group, get_ep_group,
+                               get_tensor_model_parallel_rank,
+                               get_tensor_model_parallel_world_size,
+                               tensor_model_parallel_all_reduce)
+ from vllm.forward_context import ForwardContext, get_forward_context
+ from vllm.logger import init_logger
+ from vllm.model_executor.custom_op import CustomOp
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+     is_rocm_aiter_moe_enabled)
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig, QuantizeMethodBase)
+ from vllm.model_executor.utils import set_weight_attrs
+ from vllm.platforms import current_platform
+ from vllm.platforms.interface import CpuArchEnum
+ from vllm.utils import direct_register_custom_op
+
+ has_pplx = importlib.util.find_spec("pplx_kernels") is not None
+
+ if current_platform.is_cuda_alike():
+     from .fused_batched_moe import (BatchedPrepareAndFinalize,
+                                     BatchedTritonExperts)
+     from .fused_moe import TritonExperts, fused_experts
+     from .modular_kernel import (FusedMoEModularKernel,
+                                  FusedMoEPermuteExpertsUnpermute,
+                                  FusedMoEPrepareAndFinalize)
+     if has_pplx:
+         from .pplx_prepare_finalize import PplxPrepareAndFinalize
+ else:
+     fused_experts = None  # type: ignore
+     FusedMoEPermuteExpertsUnpermute = None  # type: ignore
+     FusedMoEPrepareAndFinalize = None  # type: ignore
+ if is_rocm_aiter_moe_enabled():
+     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
+         rocm_aiter_biased_group_topk as grouped_topk)
+ else:
+     from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
+ if current_platform.is_tpu():
+     from .moe_pallas import fused_moe as fused_moe_pallas
+ else:
+     fused_moe_pallas = None  # type: ignore
+ logger = init_logger(__name__)
+
+ # Note: this limit is somewhat arbitrary and might be changed later.
+ # The size of the activations will be E x MOE_DP_CHUNK_SIZE x hidden_dim.
+ MOE_DP_CHUNK_SIZE = 256
+
+
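For scale, the activation-buffer size implied by the comment above is straightforward to estimate. A minimal sketch, assuming a hypothetical expert count, hidden size, and bf16 activations (none of these values come from this file):

num_experts = 64                     # hypothetical E
hidden_dim = 4096                    # hypothetical model hidden size
elem_bytes = 2                       # bfloat16 activations
buf_bytes = num_experts * 256 * hidden_dim * elem_bytes  # 256 = MOE_DP_CHUNK_SIZE
print(buf_bytes / 2**20)             # -> 128.0 MiB for this configuration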
+ @dataclass
+ class FusedMoEParallelConfig:
+     tp_size: int
+     dp_size: int
+     ep_size: int
+     tp_rank: int
+     dp_rank: int
+     ep_rank: int
+
+     use_ep: bool  # whether to use EP or not
+
+     @property
+     def use_pplx_kernels(self):
+         return self.dp_size > 1 and self.use_ep and \
+             envs.VLLM_ALL2ALL_BACKEND == "pplx"
+
+     @staticmethod
+     def make(tp_size_: int, dp_size_: int,
+              vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
+         """
+         Determine the MoE parallel configuration. Based on the input tp_size_,
+         dp_size_ and vLLM's parallel config, determine which levels of
+         parallelism to use in the fused MoE layer.
+
+         Args:
+             tp_size_ (int): tp_size passed into the FusedMoE constructor.
+             dp_size_ (int): dp_size passed into the FusedMoE constructor.
+             vllm_parallel_config (ParallelConfig): vLLM's parallel config
+                 object.
+
+         Examples:
+             When no parallelism is requested, i.e. tp_size_ = dp_size_ = 1,
+             we simply return the sizes unaltered and the ranks set to 0.
+
+             Expert Parallelism is considered only when either dp_size_ or
+             tp_size_ is non-trivial.
+
+             When TP = 2, DP = 1 and EP = False, the configuration on the
+             different devices is:
+             - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
+               legend : {size, rank}
+             - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
+             - Comment : Tensors are sharded across 2 devices.
+
+             When TP = 1, DP = 2 and EP = False, the configuration on the
+             different devices is:
+             - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
+             - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
+             - Comment: There are 2 engine instances and the tensors are
+               sharded across 2 devices.
+
+             When TP = 2, DP = 2 and EP = False, the configuration on the
+             different devices is:
+             - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
+             - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
+             - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
+             - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
+             - Comment: There are 2 engine instances and the tensors are
+               sharded across 4 devices.
+
+             When TP = 2, DP = 1 and EP = True, the configuration on the
+             different devices is:
+             - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
+             - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
+             - Comment: The experts are split between the 2 devices.
+
+             When TP = 1, DP = 2 and EP = True, the configuration on the
+             different devices is:
+             - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
+             - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
+             - Comment: There are 2 engine instances and the experts are split
+               between the 2 devices.
+
+             When TP = 2, DP = 2 and EP = True, the configuration on the
+             different devices is:
+             - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
+             - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
+             - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
+             - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
+             - Comment: There are 2 engine instances and the experts are split
+               between the 4 devices.
+         """
+
+         def flatten_tp_across_dp(dp_rank: int):
+             tp_rank = 0 if tp_size_ == 1 else get_tensor_model_parallel_rank()
+             # There are actually dp_size_ * tp_size_ devices. Update tp_size
+             # and tp_rank so we shard across all of them.
+             tp_size = dp_size_ * tp_size_
+             tp_rank = dp_rank * tp_size_ + tp_rank
+             return tp_size, tp_rank
+
+         use_ep = (dp_size_ * tp_size_ > 1
+                   and vllm_parallel_config.enable_expert_parallel)
+
+         dp_size = dp_size_
+         dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0
+         tp_size, tp_rank = flatten_tp_across_dp(dp_rank)
+
+         if not use_ep:
+             return FusedMoEParallelConfig(tp_size=tp_size,
+                                           tp_rank=tp_rank,
+                                           dp_size=dp_size,
+                                           dp_rank=dp_rank,
+                                           ep_size=1,
+                                           ep_rank=0,
+                                           use_ep=False)
+         # DP + EP / TP + EP / DP + TP + EP
+         assert use_ep
+         # In EP, each device fully owns a set of experts; there is no tensor
+         # parallelism. Update tp_size, tp_rank, ep_size and ep_rank to
+         # reflect that.
+         ep_size = tp_size
+         ep_rank = tp_rank
+         return FusedMoEParallelConfig(tp_size=1,
+                                       tp_rank=0,
+                                       dp_size=dp_size,
+                                       dp_rank=dp_rank,
+                                       ep_size=ep_size,
+                                       ep_rank=ep_rank,
+                                       use_ep=True)
+
+
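As a hand check of the table in the docstring above, the flatten-then-collapse arithmetic can be traced directly; the sizes here are hypothetical, chosen to match the last example:

# TP = 2, DP = 2, EP enabled; consider device 3 (dp_rank = 1, local tp rank = 1).
tp_size_, dp_size_ = 2, 2
dp_rank, local_tp_rank = 1, 1
tp_size = dp_size_ * tp_size_                  # 4: TP is flattened across DP
tp_rank = dp_rank * tp_size_ + local_tp_rank   # 3
ep_size, ep_rank = tp_size, tp_rank            # EP takes over: {4, 3}
# This matches the docstring's "device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}".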
+ # Adapted from pplx-kernels tests/all_to_all_utils.py
+ @dataclass
+ class MoEConfig:
+     num_experts: int
+     experts_per_token: int
+     hidden_dim: int
+
+     num_local_experts: int
+     moe_parallel_config: FusedMoEParallelConfig
+
+     in_dtype: torch.dtype  # The activation type.
+
+     # TODO: add more quantization params, blocked, per-token, etc.
+     block_size: int = 128
+
+     max_num_tokens: int = MOE_DP_CHUNK_SIZE
+
+     @property
+     def tp_size(self):
+         return self.moe_parallel_config.tp_size
+
+     @property
+     def dp_size(self):
+         return self.moe_parallel_config.dp_size
+
+     @property
+     def ep_size(self):
+         return self.moe_parallel_config.ep_size
+
+     @property
+     def tp_rank(self):
+         return self.moe_parallel_config.tp_rank
+
+     @property
+     def dp_rank(self):
+         return self.moe_parallel_config.dp_rank
+
+     @property
+     def ep_rank(self):
+         return self.moe_parallel_config.ep_rank
+
+     @property
+     def use_ep(self):
+         return self.moe_parallel_config.use_ep
+
+     @property
+     def use_pplx_kernels(self):
+         return self.moe_parallel_config.use_pplx_kernels
+
+
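A minimal single-device sketch of wiring the two dataclasses together (all values hypothetical; constructing them directly needs no distributed initialization):

parallel = FusedMoEParallelConfig(tp_size=1, dp_size=1, ep_size=1,
                                  tp_rank=0, dp_rank=0, ep_rank=0,
                                  use_ep=False)
moe = MoEConfig(num_experts=8, experts_per_token=2, hidden_dim=1024,
                num_local_experts=8, moe_parallel_config=parallel,
                in_dtype=torch.bfloat16)
assert moe.ep_size == 1 and not moe.use_pplx_kernels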
+ class FusedMoeWeightScaleSupported(Enum):
+     TENSOR = "tensor"
+     CHANNEL = "channel"
+     GROUP = "group"
+     BLOCK = "block"
+
+
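For orientation, these granularities conventionally correspond to the following scale-tensor shapes for a weight of shape (N, K); the shapes below are an assumption stated for illustration, not something this file defines:

# Assumed scale shapes for a weight of shape (N, K):
#   TENSOR  -> one scalar scale for the whole tensor
#   CHANNEL -> one scale per output channel: (N, 1)
#   GROUP   -> one scale per group along K: (N, K // group_size)
#   BLOCK   -> one scale per (block_n, block_k) tile:
#              (ceil(N / block_n), ceil(K / block_k))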
+ class FusedMoEMethodBase(QuantizeMethodBase):
+
+     @abstractmethod
+     def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                        hidden_size: int, intermediate_size_per_partition: int,
+                        params_dtype: torch.dtype, **extra_weight_attrs):
+         raise NotImplementedError
+
+     def init_prepare_finalize(self, moe: MoEConfig,
+                               quant_config: Optional[QuantizationConfig]):
+         all2all_manager = get_ep_group().device_communicator.all2all_manager
+         assert all2all_manager is not None
+
+         prepare_finalize = None
+         if moe.use_pplx_kernels:
+             all_to_all_args = dict(
+                 max_num_tokens=moe.max_num_tokens,
+                 num_experts=moe.num_experts,
+                 experts_per_token=moe.experts_per_token,  # topk
+                 rank=all2all_manager.rank,
+                 world_size=all2all_manager.world_size,
+                 # dp_size actually means tp_size, bug in pplx kernels
+                 dp_size=all2all_manager.tp_group.world_size,
+                 hidden_dim=moe.hidden_dim,
+                 hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
+                 # For blocked per token: set to
+                 #   ceil_div(hidden_dim, block_size) * sizeof(float32)
+                 # For per-token: set to sizeof(float32)
+                 hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
+                     (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
+                     torch.float32.itemsize)),
+                 group_name=all2all_manager.cpu_group.group_name,
+             )
+
+             handle = all2all_manager.get_handle(all_to_all_args)
+
+             prepare_finalize = PplxPrepareAndFinalize(
+                 handle,
+                 max_num_tokens=moe.max_num_tokens,
+                 world_size=all2all_manager.world_size,
+                 rank=all2all_manager.rank,
+                 # dp_size actually means tp_size, bug in pplx kernels
+                 dp_size=all2all_manager.tp_group.world_size,
+                 quant_dtype=moe.in_dtype,
+             )
+
+         if prepare_finalize is not None:
+             experts = self.select_gemm_impl(prepare_finalize)
+             self.fused_experts = FusedMoEModularKernel(
+                 prepare_finalize,
+                 experts,
+             )
+
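The hidden_dim_scale_bytes expression above is a plain ceil-division; assuming a hypothetical fp8 activation dtype (itemsize 1), it works out as:

hidden_dim, block_size = 7168, 128   # hypothetical values
scale_bytes = (hidden_dim + block_size - 1) // block_size * 4  # float32 scales
print(scale_bytes)                   # -> 224 (56 blocks x 4 bytes each)
# For 16-bit activation dtypes (itemsize != 1) the expression yields 0.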
+     def select_gemm_impl(
+             self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]
+     ) -> FusedMoEPermuteExpertsUnpermute:
+         # Based on the all2all implementation, select the appropriate
+         # gemm implementation.
+         raise NotImplementedError(
+             "Subclass must select appropriate gemm implementation"
+             " based on the prepare_finalize")
+
+     @abstractmethod
+     def apply(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         router_logits: torch.Tensor,
+         top_k: int,
+         renormalize: bool,
+         use_grouped_topk: bool = False,
+         topk_group: Optional[int] = None,
+         num_expert_group: Optional[int] = None,
+         global_num_experts: int = -1,
+         expert_map: Optional[torch.Tensor] = None,
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+         apply_router_weight_on_input: bool = False,
+         activation: str = "silu",
+     ) -> torch.Tensor:
+         raise NotImplementedError
+
+
+ @CustomOp.register("unquantized_fused_moe")
+ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
+     """MoE method without quantization."""
+
+     def __init__(self, moe: MoEConfig):
+         super().__init__()
+         self.fused_experts = fused_experts  # type: ignore
+         self.moe = moe
+
+         self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
+         if self.rocm_aiter_moe_enabled:
+             from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
+             self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
+         else:
+             self.rocm_aiter_fused_experts = None  # type: ignore
+
+     def select_gemm_impl(
+             self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]):
+
+         assert self.fused_experts == fused_experts
+
+         all2all_manager = get_ep_group().device_communicator.all2all_manager
+         assert all2all_manager is not None
+
+         experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
+
+         if isinstance(prepare_finalize,
+                       (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
+             logger.debug("BatchedTritonExperts %s", self.moe)
+             experts = BatchedTritonExperts(
+                 max_num_tokens=MOE_DP_CHUNK_SIZE,
+                 world_size=all2all_manager.world_size,
+                 # dp_size actually means tp_size, bug in pplx kernels
+                 dp_size=all2all_manager.tp_group.world_size,
+                 use_fp8_w8a8=False,
+                 use_int8_w8a8=False,
+                 use_int8_w8a16=False,
+                 use_int4_w4a16=False,
+                 block_shape=None,
+             )
+         else:
+             logger.debug("TritonExperts %s", self.moe)
+             experts = TritonExperts(
+                 use_fp8_w8a8=False,
+                 use_int8_w8a8=False,
+                 use_int8_w8a16=False,
+                 use_int4_w4a16=False,
+                 block_shape=None,
+                 per_channel_quant=False,
+             )
+         return experts
+
+     def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                        hidden_size: int, intermediate_size_per_partition: int,
+                        params_dtype: torch.dtype, **extra_weight_attrs):
+         # Fused gate_up_proj (column parallel)
+         w13_weight = torch.nn.Parameter(torch.empty(
+             num_experts,
+             2 * intermediate_size_per_partition,
+             hidden_size,
+             dtype=params_dtype),
+                                         requires_grad=False)
+         layer.register_parameter("w13_weight", w13_weight)
+         set_weight_attrs(w13_weight, extra_weight_attrs)
+
+         # down_proj (row parallel)
+         w2_weight = torch.nn.Parameter(torch.empty(
+             num_experts,
+             hidden_size,
+             intermediate_size_per_partition,
+             dtype=params_dtype),
+                                        requires_grad=False)
+         layer.register_parameter("w2_weight", w2_weight)
+         set_weight_attrs(w2_weight, extra_weight_attrs)
+
+     def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
+         # Pad the weight tensor. This is an optimization on the ROCm platform,
+         # which benefits from tensors being located far enough from one
+         # another in memory.
+         if (envs.VLLM_ROCM_MOE_PADDING and current_platform.is_rocm()
+                 and weight.stride(-1) == 1
+                 and (weight.stride(-2) * weight.element_size()) % 512 == 0):
+             num_pad = 256 // weight.element_size()
+             weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
+             torch.cuda.empty_cache()
+         return weight
+
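To see what the padding actually does to the layout, consider a hypothetical bf16 weight whose rows start a multiple of 512 bytes apart (the numbers are illustrative only):

row_elems = 4096                     # hypothetical row length
row_bytes = row_elems * 2            # 8192 bytes, % 512 == 0 -> padding kicks in
num_pad = 256 // 2                   # 128 elements for element_size == 2
# F.pad allocates padded storage and the trailing slice removes the pad from
# the view, so values are unchanged but consecutive rows now sit
# (4096 + 128) * 2 = 8448 bytes apart, breaking the power-of-two stride.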
+     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+         super().process_weights_after_loading(layer)
+
+         # Padding the weight for better performance on ROCm
+         layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
+         layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
+         # Lazy import to avoid importing triton.
+         from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+             shuffle_weights)
+
+         if self.rocm_aiter_moe_enabled:
+             shuffled_w13, shuffled_w2 = shuffle_weights(
+                 layer.w13_weight.data, layer.w2_weight.data)
+
+             layer.w13_weight.data = shuffled_w13
+             layer.w2_weight.data = shuffled_w2
+
+         if current_platform.is_cpu():
+             if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+                 import intel_extension_for_pytorch as ipex
+                 layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+                     layer.w13_weight,
+                     layer.w2_weight,
+                     use_prepack=envs.VLLM_CPU_MOE_PREPACK,
+                 )
+             else:
+                 raise NotImplementedError("CPU MOE only supports x86 arch.")
+
+     def apply(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         router_logits: torch.Tensor,
+         top_k: int,
+         renormalize: bool,
+         use_grouped_topk: bool = False,
+         topk_group: Optional[int] = None,
+         num_expert_group: Optional[int] = None,
+         global_num_experts: int = -1,
+         expert_map: Optional[torch.Tensor] = None,
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+         apply_router_weight_on_input: bool = False,
+         activation: str = "silu",
+     ) -> torch.Tensor:
+         return self.forward(
+             x=x,
+             layer=layer,
+             router_logits=router_logits,
+             top_k=top_k,
+             renormalize=renormalize,
+             use_grouped_topk=use_grouped_topk,
+             topk_group=topk_group,
+             num_expert_group=num_expert_group,
+             global_num_experts=global_num_experts,
+             expert_map=expert_map,
+             custom_routing_function=custom_routing_function,
+             scoring_func=scoring_func,
+             e_score_correction_bias=e_score_correction_bias,
+             activation=activation,
+             apply_router_weight_on_input=apply_router_weight_on_input)
+
+     def forward_cuda(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         use_grouped_topk: bool,
+         top_k: int,
+         router_logits: torch.Tensor,
+         renormalize: bool,
+         topk_group: Optional[int] = None,
+         num_expert_group: Optional[int] = None,
+         global_num_experts: int = -1,
+         expert_map: Optional[torch.Tensor] = None,
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+         apply_router_weight_on_input: bool = False,
+         activation: str = "silu",
+     ) -> torch.Tensor:
+         topk_weights, topk_ids = FusedMoE.select_experts(
+             hidden_states=x,
+             router_logits=router_logits,
+             use_grouped_topk=use_grouped_topk,
+             top_k=top_k,
+             renormalize=renormalize,
+             topk_group=topk_group,
+             num_expert_group=num_expert_group,
+             custom_routing_function=custom_routing_function,
+             scoring_func=scoring_func,
+             e_score_correction_bias=e_score_correction_bias,
+             indices_type=torch.uint32 if self.moe.use_pplx_kernels else None)
+
+         if self.rocm_aiter_moe_enabled:
+             assert expert_map is None
+             return self.rocm_aiter_fused_experts(
+                 hidden_states=x,
+                 w1=layer.w13_weight,
+                 w2=layer.w2_weight,
+                 topk_weights=topk_weights,
+                 topk_ids=topk_ids,
+                 activation=activation,
+                 apply_router_weight_on_input=apply_router_weight_on_input)
+         else:
+             return self.fused_experts(
+                 hidden_states=x,
+                 w1=layer.w13_weight,
+                 w2=layer.w2_weight,
+                 topk_weights=topk_weights,
+                 topk_ids=topk_ids,
+                 inplace=True,
+                 activation=activation,
+                 apply_router_weight_on_input=apply_router_weight_on_input,
+                 global_num_experts=global_num_experts,
+                 expert_map=expert_map,
+             )
+
529
+     def forward_cpu(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         use_grouped_topk: bool,
+         top_k: int,
+         router_logits: torch.Tensor,
+         renormalize: bool,
+         topk_group: Optional[int] = None,
+         num_expert_group: Optional[int] = None,
+         global_num_experts: int = -1,
+         expert_map: Optional[torch.Tensor] = None,
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+         activation: str = "silu",
+         apply_router_weight_on_input: bool = False,
+         **kwargs,
+     ):
+         assert activation == "silu", f"{activation} is not supported."
+         assert apply_router_weight_on_input is False
+         return layer.ipex_fusion(
+             x,
+             use_grouped_topk,
+             top_k,
+             router_logits,
+             renormalize,
+             topk_group,
+             num_expert_group,
+             custom_routing_function,
+             scoring_func,
+             e_score_correction_bias,
+         )
+
+     def forward_hpu(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         use_grouped_topk: bool,
+         top_k: int,
+         router_logits: torch.Tensor,
+         renormalize: bool,
+         topk_group: Optional[int] = None,
+         num_expert_group: Optional[int] = None,
+         global_num_experts: int = -1,
+         expert_map: Optional[torch.Tensor] = None,
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+         apply_router_weight_on_input: bool = False,
+         activation: str = "silu",
+     ) -> torch.Tensor:
+         assert not use_grouped_topk
+         assert num_expert_group is None
+         assert topk_group is None
+         assert custom_routing_function is None
+         assert layer is not None
+         assert apply_router_weight_on_input is False
+         if scoring_func != "softmax":
+             raise NotImplementedError(
+                 "Only softmax scoring function is supported for HPU.")
+         if e_score_correction_bias is not None:
+             raise NotImplementedError(
+                 "Expert score correction bias is not supported for HPU.")
+         return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight,
+                                    router_logits, top_k)
+
+     def forward_tpu(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         use_grouped_topk: bool,
+         top_k: int,
+         router_logits: torch.Tensor,
+         renormalize: bool,
+         topk_group: Optional[int] = None,
+         num_expert_group: Optional[int] = None,
+         global_num_experts: int = -1,
+         expert_map: Optional[torch.Tensor] = None,
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+         apply_router_weight_on_input: bool = False,
+         activation: str = "silu",
+     ) -> torch.Tensor:
+         assert not use_grouped_topk
+         assert num_expert_group is None
+         assert topk_group is None
+         assert custom_routing_function is None
+         assert apply_router_weight_on_input is False
+         if scoring_func != "softmax":
+             raise NotImplementedError(
+                 "Only softmax scoring function is supported for TPU.")
+         if e_score_correction_bias is not None:
+             raise NotImplementedError(
+                 "Expert score correction bias is not supported for TPU.")
+         assert activation == "silu", f"{activation} is not supported for TPU."
+         return fused_moe_pallas(hidden_states=x,
+                                 w1=layer.w13_weight,
+                                 w2=layer.w2_weight,
+                                 topk=top_k,
+                                 gating_output=router_logits,
+                                 global_num_experts=global_num_experts,
+                                 expert_map=expert_map,
+                                 renormalize=renormalize)
+
+     forward_native = forward_tpu if current_platform.is_tpu() else forward_cuda
+
+
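The assignment above selects the platform-appropriate implementation once, when the class body executes, rather than branching on every forward call. A tiny self-contained sketch of the same pattern (the condition here is made up):

    import sys

    ON_LINUX = sys.platform.startswith("linux")

    class Greeter:
        def _greet_linux(self):
            return "hello from linux"

        def _greet_other(self):
            return "hello from elsewhere"

        # Resolved once at class-definition time, not per call.
        greet = _greet_linux if ON_LINUX else _greet_other

    print(Greeter().greet())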
+ def determine_expert_map(
+         ep_size: int, ep_rank: int,
+         global_num_experts: int) -> tuple[int, Optional[torch.Tensor]]:
+     """
+     Calculates how many experts should be assigned to each rank for EP and
+     creates a mapping from global to local expert index. Experts are
+     distributed evenly across ranks. Any remaining are assigned to the
+     last rank.
+
+     Args:
+         ep_size (int): The size of the expert parallel group.
+         ep_rank (int): The rank of the current process within the expert
+             parallel group.
+         global_num_experts (int): The total number of experts in the model.
+
+     Returns:
+         tuple[int, Optional[torch.Tensor]]: A tuple containing:
+             - local_num_experts (int): The number of experts assigned
+               to the current rank.
+             - expert_map (Optional[torch.Tensor]): A tensor of shape
+               (global_num_experts,) mapping from global to local index.
+               Contains -1 for experts not assigned to the current rank.
+               Returns None if ep_size is 1.
+     """
+     assert ep_size > 0
+     if ep_size == 1:
+         return (global_num_experts, None)
+
+     local_num_experts = global_num_experts // ep_size
+
+     # Create a tensor of size num_experts filled with -1
+     expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
+     # Create an expert map for the local experts
+     if ep_rank < (ep_size - 1):
+         # Each non-last rank gets local_num_experts experts.
+         expert_map[ep_rank * local_num_experts:
+                    (ep_rank + 1) * local_num_experts] = \
+             torch.arange(0, local_num_experts, dtype=torch.int32)
+     else:
+         # All remaining experts are assigned to the last rank.
+         local_num_experts = (global_num_experts - ep_rank * local_num_experts)
+
+         expert_map[-local_num_experts:] = \
+             torch.arange(0, local_num_experts, dtype=torch.int32)
+     return (local_num_experts, expert_map)
+
+
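A quick worked example of the mapping (the function is module-level, so it can be called directly, assuming this file is vllm/model_executor/layers/fused_moe/layer.py as in upstream vLLM; the sizes are made up). Ten experts over four EP ranks give ranks 0-2 two experts each, with the last rank absorbing the remaining four:

    from vllm.model_executor.layers.fused_moe.layer import determine_expert_map

    for rank in range(4):
        n_local, expert_map = determine_expert_map(
            ep_size=4, ep_rank=rank, global_num_experts=10)
        print(rank, n_local, expert_map.tolist())
    # 0 2 [0, 1, -1, -1, -1, -1, -1, -1, -1, -1]
    # 1 2 [-1, -1, 0, 1, -1, -1, -1, -1, -1, -1]
    # 2 2 [-1, -1, -1, -1, 0, 1, -1, -1, -1, -1]
    # 3 4 [-1, -1, -1, -1, -1, -1, 0, 1, 2, 3]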
+ class FusedMoE(torch.nn.Module):
+     """FusedMoE layer for MoE models.
+
+     This layer contains both MergedColumnParallel weights (gate_up_proj /
+     w13) and RowParallelLinear weights (down_proj / w2).
+
+     Note: Mixtral uses w1 for gate_proj, w3 for up_proj, and w2 for
+     down_proj. We copy that naming convention here and handle any remapping
+     in the load_weights function in each model implementation.
+
+     Args:
+         num_experts: Number of experts in the model.
+         top_k: Number of experts selected for each token.
+         hidden_size: Input hidden state size of the transformer.
+         intermediate_size: Intermediate size of the experts.
+         params_dtype: Data type for the parameters.
+         reduce_results: Whether to all_reduce on the output of the layer.
+         renormalize: Whether to renormalize the logits in the fused_moe
+             kernel.
+         quant_config: Quantization configuration.
+     """
+
+     def __init__(
+         self,
+         num_experts: int,  # Global number of experts
+         top_k: int,
+         hidden_size: int,
+         intermediate_size: int,
+         params_dtype: Optional[torch.dtype] = None,
+         reduce_results: bool = False,
+         renormalize: bool = True,
+         use_grouped_topk: bool = False,
+         num_expert_group: Optional[int] = None,
+         topk_group: Optional[int] = None,
+         quant_config: Optional[QuantizationConfig] = None,
+         tp_size: Optional[int] = None,
+         ep_size: Optional[int] = None,
+         dp_size: Optional[int] = None,
+         prefix: str = "",
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+         apply_router_weight_on_input: bool = False,
+         activation: str = "silu",
+     ):
+         super().__init__()
+
+         if params_dtype is None:
+             params_dtype = torch.get_default_dtype()
+         self.params_dtype = params_dtype
+
+         vllm_config = get_current_vllm_config()
+         self.moe_parallel_config: FusedMoEParallelConfig = (
+             FusedMoEParallelConfig.make(
+                 tp_size_=(tp_size if tp_size is not None else
+                           get_tensor_model_parallel_world_size()),
+                 dp_size_=(dp_size if dp_size is not None else
+                           get_dp_group().world_size),
+                 vllm_parallel_config=vllm_config.parallel_config))
+
+         self.global_num_experts = num_experts
+
+         # For smuggling this layer into the fused moe custom op
+         self.use_direct_call = self.dp_size == 1
+         if not self.use_direct_call:
+             compilation_config = vllm_config.compilation_config
+             if prefix in compilation_config.static_forward_context:
+                 raise ValueError("Duplicate layer name: {}".format(prefix))
+             compilation_config.static_forward_context[prefix] = self
+             self.layer_name = prefix
+
+         # Determine expert maps
+         if self.use_ep:
+             self.local_num_experts, self.expert_map = determine_expert_map(
+                 ep_size=self.ep_size,
+                 ep_rank=self.ep_rank,
+                 global_num_experts=self.global_num_experts)
+         else:
+             self.local_num_experts, self.expert_map = (self.global_num_experts,
+                                                        None)
+
+         self.top_k = top_k
+
+         assert intermediate_size % self.tp_size == 0
+         self.hidden_size = hidden_size
+         self.intermediate_size_per_partition = intermediate_size // self.tp_size
+         self.reduce_results = reduce_results
+         self.renormalize = renormalize
+         self.use_grouped_topk = use_grouped_topk
+         if self.use_grouped_topk:
+             assert num_expert_group is not None and topk_group is not None
+         self.num_expert_group = num_expert_group
+         self.topk_group = topk_group
+         self.custom_routing_function = custom_routing_function
+         self.scoring_func = scoring_func
+         self.e_score_correction_bias = e_score_correction_bias
+         self.apply_router_weight_on_input = apply_router_weight_on_input
+         self.activation = activation
+
+         if self.scoring_func != "softmax" and not self.use_grouped_topk:
+             raise ValueError("Only softmax scoring function is supported for "
+                              "non-grouped topk.")
+         if current_platform.is_hpu():
+             from vllm_hpu_extension.ops import DynamicFusedMOE
+             self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)
+
+         moe = MoEConfig(
+             num_experts=self.global_num_experts,
+             experts_per_token=top_k,
+             hidden_dim=hidden_size,
+             num_local_experts=self.local_num_experts,
+             moe_parallel_config=self.moe_parallel_config,
+             # TODO (bnell): this needs to be fixed for quantized types.
+             in_dtype=params_dtype,
+             max_num_tokens=MOE_DP_CHUNK_SIZE,
+         )
+         self.moe_config = moe
+         self.quant_config = quant_config
+
+         # Note: get_quant_method will look at the layer's local_num_experts
+         # for heuristic purposes, so it must be initialized first.
+         quant_method: Optional[QuantizeMethodBase] = None
+
+         if quant_config is None:
+             quant_method = UnquantizedFusedMoEMethod(moe)
+         else:
+             quant_method = quant_config.get_quant_method(self, prefix)
+
+         assert quant_method is not None
+         assert isinstance(quant_method, FusedMoEMethodBase)
+         self.quant_method = quant_method
+
+         moe_quant_params = {
+             "num_experts": self.local_num_experts,
+             "hidden_size": hidden_size,
+             "intermediate_size_per_partition":
+             self.intermediate_size_per_partition,
+             "params_dtype": params_dtype,
+             "weight_loader": self.weight_loader,
+         }
+         # need full intermediate size pre-sharding for WNA16 act order
+         if (self.quant_method.__class__.__name__
+                 in ("GPTQMarlinMoEMethod",
+                     "CompressedTensorsWNA16MarlinMoEMethod",
+                     "CompressedTensorsWNA16MoEMethod")):
+             moe_quant_params["intermediate_size_full"] = intermediate_size
+
+         self.quant_method.create_weights(layer=self, **moe_quant_params)
+
+     @property
+     def tp_size(self):
+         return self.moe_parallel_config.tp_size
+
+     @property
+     def dp_size(self):
+         return self.moe_parallel_config.dp_size
+
+     @property
+     def ep_size(self):
+         return self.moe_parallel_config.ep_size
+
+     @property
+     def tp_rank(self):
+         return self.moe_parallel_config.tp_rank
+
+     @property
+     def dp_rank(self):
+         return self.moe_parallel_config.dp_rank
+
+     @property
+     def ep_rank(self):
+         return self.moe_parallel_config.ep_rank
+
+     @property
+     def use_ep(self):
+         return self.moe_parallel_config.use_ep
+
+     @property
+     def use_pplx_kernels(self):
+         return self.moe_parallel_config.use_pplx_kernels
+
+     def _load_per_tensor_weight_scale(self, shard_id: str,
+                                       param: torch.nn.Parameter,
+                                       loaded_weight: torch.Tensor,
+                                       expert_id: int):
+         param_data = param.data
+         # for per tensor weight quantization
+         if shard_id in ("w1", "w3"):
+             # We have to keep the weight scales of w1 and w3 because
+             # we need to re-quantize w1/w3 weights after weight loading.
+             idx = 0 if shard_id == "w1" else 1
+             param_data[expert_id][idx] = loaded_weight
+         # If we are in the row parallel case (down_proj)
+         elif shard_id == "w2":
+             param_data[expert_id] = loaded_weight
+
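The w13 scale parameter therefore keeps one slot per logical shard (w1 and w3) so both can be re-quantized consistently later, while w2 needs only a single per-expert scale. A toy sketch of that storage layout (hypothetical sizes):

    import torch

    num_experts = 4
    w13_scale = torch.ones(num_experts, 2)  # slot 0: w1 scale, slot 1: w3 scale
    w2_scale = torch.ones(num_experts)      # a single per-expert scale

    w13_scale[1][0] = 0.02   # expert 1, shard_id == "w1"
    w13_scale[1][1] = 0.04   # expert 1, shard_id == "w3"
    w2_scale[1] = 0.03       # expert 1, shard_id == "w2"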
+     def _load_model_weight_or_group_weight_scale(self,
+                                                  shard_dim: int,
+                                                  expert_data: torch.Tensor,
+                                                  shard_id: str,
+                                                  loaded_weight: torch.Tensor,
+                                                  tp_rank: int,
+                                                  load_full_w2: bool = False):
+         """
+         Load grouped weight scales for group quantization or model weights.
+         :param shard_dim: dimension to shard
+         :param expert_data: parameter for a particular expert
+         :param shard_id: either w1, w2, or w3
+         :param loaded_weight: checkpoint weight to load into the param
+         :param tp_rank: tensor parallel rank
+         :param load_full_w2: whether to load the full w2 without TP sharding
+         """
+         if shard_id == "w2":
+             # In the case where we have actorder/g_idx, we do not partition
+             # the w2 scales, as indicated by the `load_full` argument, for
+             # all tp cases.
+             self._load_w2(shard_dim=shard_dim,
+                           loaded_weight=loaded_weight,
+                           expert_data=expert_data,
+                           tp_rank=tp_rank,
+                           load_full=load_full_w2)
+         elif shard_id in ("w1", "w3"):
+             self._load_w13(shard_id=shard_id,
+                            shard_dim=shard_dim,
+                            loaded_weight=loaded_weight,
+                            expert_data=expert_data,
+                            tp_rank=tp_rank)
+
+     def _load_per_channel_weight_scale(self, expert_data: torch.Tensor,
+                                        shard_dim: int, shard_id: str,
+                                        loaded_weight: torch.Tensor,
+                                        tp_rank: int):
+         # for per channel weight quantization
+         if shard_id == "w2":
+             expert_data.copy_(loaded_weight)
+         elif shard_id in ("w1", "w3"):
+             self._load_w13(shard_id=shard_id,
+                            shard_dim=shard_dim,
+                            loaded_weight=loaded_weight,
+                            expert_data=expert_data,
+                            tp_rank=tp_rank)
+
+     def _load_w13(self, expert_data: torch.Tensor, shard_dim: int,
+                   shard_id: str, loaded_weight: torch.Tensor, tp_rank: int):
+
+         # Index the loaded weight for tp sharding.
+         # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
+         shard_size = expert_data.shape[shard_dim] // 2
+         loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
+                                              shard_size)
+         # Narrow parameter and load.
+         # w1, gate_proj: Load into first logical weight of w13.
+         if shard_id == "w1":
+             expert_data = expert_data.narrow(shard_dim, 0, shard_size)
+         # w3, up_proj: Load into second logical weight of w13.
+         else:
+             assert shard_id == "w3"
+             expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
+         expert_data.copy_(loaded_weight)
+
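A self-contained sketch of the narrow-and-copy pattern above, with made-up sizes (tp_size=2, rank 1, loading gate_proj into the first half of the merged w13):

    import torch

    tp_size, tp_rank = 2, 1
    w13 = torch.zeros(2 * 8, 16)              # merged gate/up rows, this rank
    full_gate = torch.randn(8 * tp_size, 16)  # unsharded checkpoint tensor

    shard_size = w13.shape[0] // 2                       # 8 rows per weight
    shard = full_gate.narrow(0, shard_size * tp_rank, shard_size)
    w13.narrow(0, 0, shard_size).copy_(shard)            # "w1" -> first half

    assert torch.equal(w13[:8], full_gate[8:16])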
+     def _load_w2(self,
+                  expert_data: torch.Tensor,
+                  shard_dim: int,
+                  loaded_weight: torch.Tensor,
+                  tp_rank: int,
+                  load_full: bool = False):
+
+         # Index the loaded weight for tp sharding.
+         # down_proj: "RowParallel" so tp sharding on input_dim
+         # Narrow parameter and load.
+         shard_size = expert_data.shape[shard_dim]
+         if not load_full:
+             loaded_weight = loaded_weight.narrow(shard_dim,
+                                                  shard_size * tp_rank,
+                                                  shard_size)
+         # w2, down_proj: Load into only logical weight of w2.
+         expert_data.copy_(loaded_weight)
+
+     def _load_single_value(self, param: torch.nn.Parameter,
+                            loaded_weight: torch.Tensor, expert_id: int):
+         param_data = param.data
+
+         # Input scales can be loaded directly and should be equal.
+         param_data[expert_id] = loaded_weight
+
+     def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
+                     shard_dim: int, loaded_weight: torch.Tensor,
+                     tp_rank: int):
+
+         if shard_id == "w2":
+             self._load_w2(shard_dim=shard_dim,
+                           loaded_weight=loaded_weight,
+                           expert_data=expert_data,
+                           tp_rank=tp_rank)
+         else:
+             assert shard_id in ("w1", "w3")
+             expert_data.copy_(loaded_weight)
+
+     def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
+         if self.expert_map is None:
+             return expert_id
+         return self.expert_map[expert_id].item()
+
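The lookup semantics in isolation, with a toy map for a rank that owns global experts 2 and 3:

    import torch

    expert_map = torch.tensor([-1, -1, 0, 1], dtype=torch.int32)
    print(expert_map[3].item())  # 1  -> local index of global expert 3
    print(expert_map[0].item())  # -1 -> not owned; weight_loader skips it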
+     def weight_loader(self, param: torch.nn.Parameter,
+                       loaded_weight: torch.Tensor, weight_name: str,
+                       shard_id: str, expert_id: int) -> None:
+
+         expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
+         if expert_id == -1:
+             return
+         quant_method_name = self.quant_method.__class__.__name__
+         # compressed-tensors checkpoints with packed weights are stored flipped
+         # TODO (mgoin): check self.quant_method.quant_config.quant_format
+         # against known CompressionFormat enum values that have this quality
+         if quant_method_name in (
+                 "CompressedTensorsWNA16MarlinMoEMethod",
+                 "CompressedTensorsWNA16MoEMethod"):
+             loaded_weight = loaded_weight.t().contiguous()
+
+         if shard_id not in ("w1", "w2", "w3"):
+             raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
+                              f"got {shard_id}.")
+
+         WEIGHT_SCALE_SUPPORTED = [
+             e.value for e in FusedMoeWeightScaleSupported
+         ]
+         # Fetch the dim to shard the parameter/loaded weight
+         # based on the shard id. This will be whichever dimension
+         # intermediate_size_per_partition lies on.
+         SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
+
+         is_gguf_weight = getattr(param, "is_gguf_weight", False)
+         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+         if is_gguf_weight_type:
+             param.weight_type = loaded_weight.item()
+             param.data.copy_(loaded_weight)
+             return
+
+         # is_transposed: whether the dim to shard the weight on is flipped.
+         # Required by GPTQ and compressed-tensors checkpoints.
+         is_transposed = getattr(param, "is_transposed", False)
+         shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
+         if is_transposed:
+             shard_dim = int(not shard_dim)
+
+         full_load = len(loaded_weight.shape) == 3
+         if full_load:
+             shard_dim += 1
+
+         # Materialize GGUF UninitializedParameter
+         if is_gguf_weight and isinstance(param, UninitializedParameter):
+             final_shape = list(loaded_weight.shape)
+             if shard_id in ["w1", "w3"]:
+                 final_shape[1] *= 2
+             final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
+             param.materialize(final_shape, dtype=loaded_weight.dtype)
+
+         expert_data = param.data if full_load else param.data[expert_id]
+         # Case input scale: input_scale loading is only supported for fp8
+         if "input_scale" in weight_name:
+             # this is needed for compressed-tensors only
+             loaded_weight = loaded_weight.to(param.data.device)
+
+             if ("compressed" in quant_method_name.lower()
+                     and param.data[expert_id] != 1
+                     and (param.data[expert_id] - loaded_weight).abs() > 1e-5):
+                 raise ValueError(
+                     "input_scales of w1 and w3 of a layer "
+                     f"must be equal. But got {param.data[expert_id]} "
+                     f"vs. {loaded_weight}")
+
+             self._load_single_value(param=param,
+                                     loaded_weight=loaded_weight,
+                                     expert_id=expert_id)
+             return
+
+         # Case g_idx
+         if "g_idx" in weight_name:
+             self._load_g_idx(shard_dim=0,
+                              shard_id=shard_id,
+                              loaded_weight=loaded_weight,
+                              expert_data=expert_data,
+                              tp_rank=self.tp_rank)
+             return
+
+         if "ModelOpt" in quant_method_name:
+             if ('weight_scale_2' in weight_name
+                     or 'input_scale' in weight_name):
+                 self._load_per_tensor_weight_scale(shard_id=shard_id,
+                                                    param=param,
+                                                    loaded_weight=loaded_weight,
+                                                    expert_id=expert_id)
+             elif "weight" in weight_name:
+                 self._load_model_weight_or_group_weight_scale(
+                     shard_id=shard_id,
+                     shard_dim=shard_dim,
+                     loaded_weight=loaded_weight,
+                     expert_data=expert_data,
+                     tp_rank=self.tp_rank)
+             return
+
+         # Case weight scales, zero_points and offset
+         if ("scale" in weight_name or "zero" in weight_name
+                 or "offset" in weight_name):
+             # load the weight scales and zp based on the quantization scheme
+             # supported weight scales/zp can be found in
+             # FusedMoeWeightScaleSupported
+             # TODO @dsikka: once hardened, refactor to use vLLM Parameters
+             # specific to each case
+             quant_method = getattr(param, "quant_method", None)
+             if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
+                 self._load_per_channel_weight_scale(
+                     shard_id=shard_id,
+                     shard_dim=shard_dim,
+                     loaded_weight=loaded_weight,
+                     expert_data=expert_data,
+                     tp_rank=self.tp_rank)
+             elif quant_method in [
+                     FusedMoeWeightScaleSupported.GROUP.value,
+                     FusedMoeWeightScaleSupported.BLOCK.value,
+             ]:
+                 self._load_model_weight_or_group_weight_scale(
+                     shard_id=shard_id,
+                     shard_dim=shard_dim,
+                     loaded_weight=loaded_weight,
+                     expert_data=expert_data,
+                     tp_rank=self.tp_rank,
+                     load_full_w2=getattr(param, "load_full_w2", False))
+             elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
+                 self._load_per_tensor_weight_scale(shard_id=shard_id,
+                                                    param=param,
+                                                    loaded_weight=loaded_weight,
+                                                    expert_id=expert_id)
+             else:
+                 raise ValueError(
+                     f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
+             return
+
+         # Case weight_shape
+         if "weight_shape" in weight_name:
+             # only required by compressed-tensors
+             self._load_single_value(param=param,
+                                     loaded_weight=loaded_weight,
+                                     expert_id=expert_id)
+             return
+
+         # Case model weights
+         if "weight" in weight_name:
+             self._load_model_weight_or_group_weight_scale(
+                 shard_id=shard_id,
+                 shard_dim=shard_dim,
+                 loaded_weight=loaded_weight,
+                 expert_data=expert_data,
+                 tp_rank=self.tp_rank)
+             return
+
+     @staticmethod
+     def select_experts(hidden_states: torch.Tensor,
+                        router_logits: torch.Tensor,
+                        top_k: int,
+                        use_grouped_topk: bool,
+                        renormalize: bool,
+                        topk_group: Optional[int] = None,
+                        num_expert_group: Optional[int] = None,
+                        custom_routing_function: Optional[Callable] = None,
+                        scoring_func: str = "softmax",
+                        e_score_correction_bias: Optional[torch.Tensor] = None,
+                        indices_type: Optional[torch.dtype] = None):
+         from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+
+         # DeepSeekV2 uses grouped_top_k
+         if use_grouped_topk:
+             assert topk_group is not None
+             assert num_expert_group is not None
+             topk_weights, topk_ids = grouped_topk(
+                 hidden_states=hidden_states,
+                 gating_output=router_logits,
+                 topk=top_k,
+                 renormalize=renormalize,
+                 num_expert_group=num_expert_group,
+                 topk_group=topk_group,
+                 scoring_func=scoring_func,
+                 e_score_correction_bias=e_score_correction_bias)
+             if indices_type is not None:
+                 topk_ids = topk_ids.to(dtype=indices_type)
+         elif custom_routing_function is None:
+             topk_weights, topk_ids, token_expert_indices = fused_topk(
+                 hidden_states=hidden_states,
+                 gating_output=router_logits,
+                 topk=top_k,
+                 renormalize=renormalize,
+                 indices_type=indices_type,
+             )
+         else:
+             topk_weights, topk_ids = custom_routing_function(
+                 hidden_states=hidden_states,
+                 gating_output=router_logits,
+                 topk=top_k,
+                 renormalize=renormalize)
+             if indices_type is not None:
+                 topk_ids = topk_ids.to(dtype=indices_type)
+
+         return topk_weights, topk_ids
+
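For intuition, here is a rough, unfused reference for what the default (non-grouped, softmax) routing path computes; fused_topk is an optimized equivalent of this:

    import torch

    def naive_topk(router_logits, top_k, renormalize=True):
        # Softmax over experts, pick the top_k, optionally renormalize
        # the selected weights so they sum to 1 per token.
        scores = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
        topk_weights, topk_ids = torch.topk(scores, top_k, dim=-1)
        if renormalize:
            topk_weights = topk_weights / topk_weights.sum(-1, keepdim=True)
        return topk_weights, topk_ids

    logits = torch.randn(4, 8)             # 4 tokens, 8 experts
    weights, ids = naive_topk(logits, top_k=2)
    print(weights.sum(-1))                 # ~1.0 per token after renormalize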
+     def must_reduce_shared_expert_outputs(self) -> bool:
+         """
+         The shared_experts are typically computed using the RowParallelLinear
+         layer. The result of this function is typically used as
+         the reduce_results argument to the module.
+         When just tensor-parallel is used, it is not required to reduce
+         the shared_experts results immediately. Instead we reduce once at
+         the end of the MoE op. (Refer to the DeepSeekV2MoE module.)
+         With EP and the pplx kernels this is no longer viable, as all
+         GPU ranks in DP produce the complete set of hidden_states.
+         Therefore it is required that we reduce the shared_experts output
+         early.
+         """
+         return self.use_pplx_kernels
+
+     def maybe_all_reduce_tensor_model_parallel(
+             self, final_hidden_states: torch.Tensor):
+         """
+         The pplx combine kernel reduces across GPU ranks by default, so an
+         extra tensor-parallel all-reduce is only needed when the pplx
+         kernels are not in use.
+         """
+         if self.use_pplx_kernels:
+             return final_hidden_states
+         else:
+             return tensor_model_parallel_all_reduce(final_hidden_states)
+
+     def forward(self, hidden_states: torch.Tensor,
+                 router_logits: torch.Tensor):
+         if self.use_direct_call:
+             return self.forward_impl(hidden_states, router_logits)
+         else:
+             return torch.ops.vllm.moe_forward(hidden_states, router_logits,
+                                               self.layer_name)
+
+     def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
+                              full_router_logits: torch.Tensor):
+
+         full_final_hidden_states = torch.empty_like(full_hidden_states)
+
+         def process_chunk(chunk_start, chunk_end, skip_result_store=False):
+             hidden_states = full_hidden_states[chunk_start:chunk_end, :]
+             router_logits = full_router_logits[chunk_start:chunk_end, :]
+
+             # Matrix multiply.
+             final_hidden_states = self.quant_method.apply(
+                 layer=self,
+                 x=hidden_states,
+                 router_logits=router_logits,
+                 top_k=self.top_k,
+                 renormalize=self.renormalize,
+                 use_grouped_topk=self.use_grouped_topk,
+                 global_num_experts=self.global_num_experts,
+                 expert_map=self.expert_map,
+                 topk_group=self.topk_group,
+                 num_expert_group=self.num_expert_group,
+                 custom_routing_function=self.custom_routing_function,
+                 scoring_func=self.scoring_func,
+                 e_score_correction_bias=self.e_score_correction_bias,
+                 activation=self.activation,
+             )
+
+             if not skip_result_store:
+                 full_final_hidden_states[chunk_start:chunk_end, :].copy_(
+                     final_hidden_states)
+
+         ctx = get_forward_context()
+         max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
+         moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE
+
+         num_tokens = full_hidden_states.size(0)
+         # Iterate over the max token count across DP so that every rank runs
+         # the same number of chunks (and hence the same collectives), even if
+         # this rank has fewer tokens; excess iterations re-process the last
+         # chunk but skip storing the result.
+         for chunk_start_ in range(0, max_tokens_across_dp,
+                                   moe_dp_chunk_size_per_rank):
+             chunk_start = chunk_start_
+             chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
+                             max_tokens_across_dp)
+             # clamp start and end into this rank's token range
+             chunk_start = min(chunk_start, num_tokens - 1)
+             chunk_end = min(chunk_end, num_tokens)
+
+             process_chunk(chunk_start,
+                           chunk_end,
+                           skip_result_store=chunk_start_ >= num_tokens)
+
+         return full_final_hidden_states
+
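The clamping logic is easier to see with concrete numbers. A standalone sketch (hypothetical sizes: this rank has 5 tokens, the largest DP rank has 13, chunk size 4; every rank must run four iterations):

    num_tokens, max_tokens_across_dp, chunk = 5, 13, 4
    for chunk_start_ in range(0, max_tokens_across_dp, chunk):
        chunk_start = min(chunk_start_, num_tokens - 1)
        chunk_end = min(chunk_start_ + chunk, num_tokens)
        skip = chunk_start_ >= num_tokens
        print(chunk_start, chunk_end, skip)
    # 0 4 False
    # 4 5 False
    # 4 5 True   <- dummy pass: keeps collectives in lockstep
    # 4 5 True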
+     def forward_impl(self, hidden_states: torch.Tensor,
+                      router_logits: torch.Tensor):
+         assert self.quant_method is not None
+         if self.moe_parallel_config.use_pplx_kernels:
+             return self.forward_impl_chunked(hidden_states, router_logits)
+
+         if self.dp_size > 1:
+             hidden_states, router_logits = get_ep_group().dispatch(
+                 hidden_states, router_logits)
+         # Matrix multiply.
+         final_hidden_states = self.quant_method.apply(
+             layer=self,
+             x=hidden_states,
+             router_logits=router_logits,
+             top_k=self.top_k,
+             renormalize=self.renormalize,
+             use_grouped_topk=self.use_grouped_topk,
+             global_num_experts=self.global_num_experts,
+             expert_map=self.expert_map,
+             topk_group=self.topk_group,
+             num_expert_group=self.num_expert_group,
+             custom_routing_function=self.custom_routing_function,
+             scoring_func=self.scoring_func,
+             e_score_correction_bias=self.e_score_correction_bias,
+             activation=self.activation,
+             apply_router_weight_on_input=self.apply_router_weight_on_input,
+         )
+
+         if self.dp_size > 1:
+             final_hidden_states = get_ep_group().combine(final_hidden_states)
+
+         if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+             # Default set to False. (May have to add shared expert outputs.)
+             final_hidden_states = tensor_model_parallel_all_reduce(
+                 final_hidden_states)
+
+         return final_hidden_states
+
+     @classmethod
+     def make_expert_params_mapping(
+             cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
+             ckpt_up_proj_name: str,
+             num_experts: int) -> list[tuple[str, str, int, str]]:
+
+         return [
+             # (param_name, weight_name, expert_id, shard_id)
+             ("experts.w13_" if weight_name
+              in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
+              f"experts.{expert_id}.{weight_name}.", expert_id, shard_id)
+             for expert_id in range(num_experts) for shard_id, weight_name in [
+                 ("w1", ckpt_gate_proj_name),
+                 ("w2", ckpt_down_proj_name),
+                 ("w3", ckpt_up_proj_name),
+             ]
+         ]
+
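A concrete look at what this classmethod produces, using the Mixtral-style checkpoint names as a hypothetical input (assuming vLLM is importable):

    from vllm.model_executor.layers.fused_moe.layer import FusedMoE

    mapping = FusedMoE.make_expert_params_mapping(
        ckpt_gate_proj_name="w1",
        ckpt_down_proj_name="w2",
        ckpt_up_proj_name="w3",
        num_experts=2)
    for entry in mapping:
        print(entry)
    # ('experts.w13_', 'experts.0.w1.', 0, 'w1')
    # ('experts.w2_', 'experts.0.w2.', 0, 'w2')
    # ('experts.w13_', 'experts.0.w3.', 0, 'w3')
    # ... and the same three tuples again for expert_id 1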
+     def extra_repr(self) -> str:
+
+         s = (
+             f"global_num_experts={self.global_num_experts}, "
+             f"local_num_experts={self.local_num_experts}, "
+             f"top_k={self.top_k}, "
+             f"intermediate_size_per_partition={self.intermediate_size_per_partition}, "  # noqa: E501
+             f"tp_size={self.tp_size},\n"
+             f"ep_size={self.ep_size}, "
+             f"reduce_results={self.reduce_results}, "
+             f"renormalize={self.renormalize}, "
+             f"use_grouped_topk={self.use_grouped_topk}")
+
+         if self.use_grouped_topk:
+             s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501
+
+         s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'"  # noqa: E501
+
+         return s
+
+
+ def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
+                 layer_name: str) -> torch.Tensor:
+     forward_context: ForwardContext = get_forward_context()
+     self = forward_context.no_compile_layers[layer_name]
+     assert self.quant_method is not None
+
+     return self.forward_impl(hidden_states, router_logits)
+
+
+ def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
+                      layer_name: str) -> torch.Tensor:
+     return torch.empty_like(hidden_states)
+
+
+ direct_register_custom_op(
+     op_name="moe_forward",
+     op_func=moe_forward,
+     mutates_args=[],
+     fake_impl=moe_forward_fake,
+     dispatch_key=current_platform.dispatch_key,
+ )
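Once registered, the op becomes callable through the torch.ops namespace; this is how FusedMoE.forward invokes it on the non-direct-call path (illustrative, and it requires a live vLLM forward context to resolve layer_name):

    out = torch.ops.vllm.moe_forward(hidden_states, router_logits,
                                     self.layer_name)

The fake_impl gives torch.compile a shape-correct stand-in (the MoE output has the same shape as its input), so graphs can be traced without running the real experts.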