vllm-cpu-amxbf16 0.9.1 (vllm_cpu_amxbf16-0.9.1-cp312-cp312-manylinux_2_17_x86_64.whl)

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
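Every entry below is listed as added lines against an empty baseline (+N -0), which is what a first-time comparison of a wheel's contents looks like; binary artifacts such as vllm/_C.abi3.so or the disagg_prefill_workflow.jpg image show +0 -0 because they have no meaningful line counts. As a minimal sketch of how such a listing can be reproduced locally (not part of the diff itself; the wheel filename is inferred from the header above, and the path is assumed to be the current directory):

import zipfile

# Assumed local path to the downloaded wheel; a wheel is a plain zip archive.
WHEEL = "vllm_cpu_amxbf16-0.9.1-cp312-cp312-manylinux_2_17_x86_64.whl"

with zipfile.ZipFile(WHEEL) as wheel:
    for info in wheel.infolist():
        # Binary members get no line count; the diff renders them as "+0 -0".
        if info.filename.endswith((".so", ".jpg")):
            print(f"{info.filename} +0 -0")
            continue
        # Count newline-delimited lines in the archived file.
        with wheel.open(info) as f:
            lines = sum(1 for _ in f)
        # Relative to an empty previous version, every line is an addition.
        print(f"{info.filename} +{lines} -0")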
Files changed (1197)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +53 -0
  3. vllm/_custom_ops.py +1828 -0
  4. vllm/_ipex_ops.py +244 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +115 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +308 -0
  20. vllm/attention/backends/blocksparse_attn.py +461 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1498 -0
  23. vllm/attention/backends/flash_attn.py +1003 -0
  24. vllm/attention/backends/flashinfer.py +1104 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +313 -0
  27. vllm/attention/backends/ipex_attn.py +398 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1385 -0
  30. vllm/attention/backends/pallas.py +351 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +975 -0
  34. vllm/attention/backends/torch_sdpa.py +703 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +802 -0
  38. vllm/attention/layer.py +468 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +906 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/prefix_prefill.py +902 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  54. vllm/attention/ops/triton_decode_attention.py +674 -0
  55. vllm/attention/ops/triton_flash_attention.py +979 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  57. vllm/attention/ops/triton_unified_attention.py +334 -0
  58. vllm/attention/selector.py +187 -0
  59. vllm/attention/utils/fa_utils.py +55 -0
  60. vllm/beam_search.py +87 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +1185 -0
  63. vllm/benchmarks/endpoint_request_func.py +381 -0
  64. vllm/benchmarks/latency.py +168 -0
  65. vllm/benchmarks/serve.py +1135 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +70 -0
  68. vllm/collect_env.py +820 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +89 -0
  71. vllm/compilation/backends.py +563 -0
  72. vllm/compilation/base_piecewise_backend.py +72 -0
  73. vllm/compilation/collective_fusion.py +127 -0
  74. vllm/compilation/compiler_interface.py +544 -0
  75. vllm/compilation/counter.py +38 -0
  76. vllm/compilation/cuda_piecewise_backend.py +214 -0
  77. vllm/compilation/decorators.py +250 -0
  78. vllm/compilation/fix_functionalization.py +191 -0
  79. vllm/compilation/fusion.py +618 -0
  80. vllm/compilation/fx_utils.py +62 -0
  81. vllm/compilation/inductor_pass.py +115 -0
  82. vllm/compilation/monitor.py +39 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +137 -0
  85. vllm/compilation/pass_manager.py +78 -0
  86. vllm/compilation/sequence_parallelism.py +268 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +67 -0
  89. vllm/compilation/wrapper.py +135 -0
  90. vllm/config.py +4746 -0
  91. vllm/connections.py +174 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +399 -0
  95. vllm/core/block/common.py +371 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  97. vllm/core/block/interfaces.py +319 -0
  98. vllm/core/block/naive_block.py +466 -0
  99. vllm/core/block/prefix_caching_block.py +1135 -0
  100. vllm/core/block/utils.py +28 -0
  101. vllm/core/block_manager.py +521 -0
  102. vllm/core/evictor.py +157 -0
  103. vllm/core/interfaces.py +135 -0
  104. vllm/core/placeholder_block_space_manager.py +100 -0
  105. vllm/core/scheduler.py +2093 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +281 -0
  108. vllm/distributed/__init__.py +6 -0
  109. vllm/distributed/communication_op.py +41 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +264 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +176 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  120. vllm/distributed/device_communicators/pynccl.py +218 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +341 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  125. vllm/distributed/kv_events.py +356 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +12 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +128 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +108 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +134 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1030 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +384 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +280 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  152. vllm/distributed/parallel_state.py +1296 -0
  153. vllm/distributed/tpu_distributed_utils.py +177 -0
  154. vllm/distributed/utils.py +536 -0
  155. vllm/engine/__init__.py +0 -0
  156. vllm/engine/arg_utils.py +1708 -0
  157. vllm/engine/async_llm_engine.py +1200 -0
  158. vllm/engine/async_timeout.py +173 -0
  159. vllm/engine/llm_engine.py +2097 -0
  160. vllm/engine/metrics.py +629 -0
  161. vllm/engine/metrics_types.py +94 -0
  162. vllm/engine/multiprocessing/__init__.py +148 -0
  163. vllm/engine/multiprocessing/client.py +681 -0
  164. vllm/engine/multiprocessing/engine.py +460 -0
  165. vllm/engine/output_processor/__init__.py +0 -0
  166. vllm/engine/output_processor/interfaces.py +75 -0
  167. vllm/engine/output_processor/multi_step.py +216 -0
  168. vllm/engine/output_processor/single_step.py +145 -0
  169. vllm/engine/output_processor/stop_checker.py +131 -0
  170. vllm/engine/output_processor/util.py +28 -0
  171. vllm/engine/protocol.py +317 -0
  172. vllm/entrypoints/__init__.py +0 -0
  173. vllm/entrypoints/api_server.py +178 -0
  174. vllm/entrypoints/chat_utils.py +1299 -0
  175. vllm/entrypoints/cli/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  177. vllm/entrypoints/cli/benchmark/base.py +39 -0
  178. vllm/entrypoints/cli/benchmark/latency.py +30 -0
  179. vllm/entrypoints/cli/benchmark/main.py +54 -0
  180. vllm/entrypoints/cli/benchmark/serve.py +30 -0
  181. vllm/entrypoints/cli/benchmark/throughput.py +30 -0
  182. vllm/entrypoints/cli/collect_env.py +35 -0
  183. vllm/entrypoints/cli/main.py +65 -0
  184. vllm/entrypoints/cli/openai.py +205 -0
  185. vllm/entrypoints/cli/run_batch.py +62 -0
  186. vllm/entrypoints/cli/serve.py +328 -0
  187. vllm/entrypoints/cli/types.py +25 -0
  188. vllm/entrypoints/launcher.py +147 -0
  189. vllm/entrypoints/llm.py +1544 -0
  190. vllm/entrypoints/logger.py +50 -0
  191. vllm/entrypoints/openai/__init__.py +0 -0
  192. vllm/entrypoints/openai/api_server.py +1387 -0
  193. vllm/entrypoints/openai/cli_args.py +315 -0
  194. vllm/entrypoints/openai/logits_processors.py +90 -0
  195. vllm/entrypoints/openai/protocol.py +1913 -0
  196. vllm/entrypoints/openai/run_batch.py +463 -0
  197. vllm/entrypoints/openai/serving_chat.py +1221 -0
  198. vllm/entrypoints/openai/serving_classification.py +160 -0
  199. vllm/entrypoints/openai/serving_completion.py +592 -0
  200. vllm/entrypoints/openai/serving_embedding.py +201 -0
  201. vllm/entrypoints/openai/serving_engine.py +986 -0
  202. vllm/entrypoints/openai/serving_models.py +315 -0
  203. vllm/entrypoints/openai/serving_pooling.py +232 -0
  204. vllm/entrypoints/openai/serving_score.py +433 -0
  205. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  206. vllm/entrypoints/openai/serving_transcription.py +424 -0
  207. vllm/entrypoints/openai/tool_parsers/__init__.py +23 -0
  208. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  209. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  210. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  211. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  212. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  213. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  214. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  215. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  216. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  217. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  218. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  219. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  220. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  221. vllm/entrypoints/score_utils.py +50 -0
  222. vllm/entrypoints/ssl.py +75 -0
  223. vllm/entrypoints/utils.py +233 -0
  224. vllm/env_override.py +41 -0
  225. vllm/envs.py +944 -0
  226. vllm/executor/__init__.py +0 -0
  227. vllm/executor/executor_base.py +401 -0
  228. vllm/executor/mp_distributed_executor.py +244 -0
  229. vllm/executor/msgspec_utils.py +30 -0
  230. vllm/executor/multiproc_worker_utils.py +313 -0
  231. vllm/executor/ray_distributed_executor.py +701 -0
  232. vllm/executor/ray_utils.py +399 -0
  233. vllm/executor/uniproc_executor.py +139 -0
  234. vllm/forward_context.py +179 -0
  235. vllm/inputs/__init__.py +41 -0
  236. vllm/inputs/data.py +331 -0
  237. vllm/inputs/parse.py +151 -0
  238. vllm/inputs/preprocess.py +909 -0
  239. vllm/inputs/registry.py +237 -0
  240. vllm/jsontree.py +80 -0
  241. vllm/logger.py +212 -0
  242. vllm/logging_utils/__init__.py +8 -0
  243. vllm/logging_utils/dump_input.py +85 -0
  244. vllm/logging_utils/formatter.py +18 -0
  245. vllm/logits_process.py +119 -0
  246. vllm/lora/__init__.py +0 -0
  247. vllm/lora/fully_sharded_layers.py +355 -0
  248. vllm/lora/layers.py +1285 -0
  249. vllm/lora/lora.py +199 -0
  250. vllm/lora/models.py +818 -0
  251. vllm/lora/ops/__init__.py +0 -0
  252. vllm/lora/ops/torch_ops/__init__.py +16 -0
  253. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  254. vllm/lora/ops/triton_ops/__init__.py +12 -0
  255. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  256. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  257. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  258. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  259. vllm/lora/ops/triton_ops/utils.py +120 -0
  260. vllm/lora/ops/xla_ops/__init__.py +7 -0
  261. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  262. vllm/lora/peft_helper.py +136 -0
  263. vllm/lora/punica_wrapper/__init__.py +10 -0
  264. vllm/lora/punica_wrapper/punica_base.py +485 -0
  265. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  266. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  267. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  268. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  269. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  270. vllm/lora/punica_wrapper/utils.py +164 -0
  271. vllm/lora/request.py +99 -0
  272. vllm/lora/resolver.py +85 -0
  273. vllm/lora/utils.py +240 -0
  274. vllm/lora/worker_manager.py +259 -0
  275. vllm/model_executor/__init__.py +16 -0
  276. vllm/model_executor/custom_op.py +152 -0
  277. vllm/model_executor/guided_decoding/__init__.py +181 -0
  278. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  279. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  280. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  281. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  282. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  283. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  284. vllm/model_executor/guided_decoding/utils.py +242 -0
  285. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  286. vllm/model_executor/layers/__init__.py +0 -0
  287. vllm/model_executor/layers/activation.py +369 -0
  288. vllm/model_executor/layers/fused_moe/__init__.py +54 -0
  289. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +125 -0
  290. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +117 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  455. vllm/model_executor/layers/fused_moe/cutlass_moe.py +461 -0
  456. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +240 -0
  457. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +240 -0
  458. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +186 -0
  459. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +775 -0
  460. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +232 -0
  461. vllm/model_executor/layers/fused_moe/fused_moe.py +1724 -0
  462. vllm/model_executor/layers/fused_moe/layer.py +1535 -0
  463. vllm/model_executor/layers/fused_moe/modular_kernel.py +446 -0
  464. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  465. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  466. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  467. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  468. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +159 -0
  469. vllm/model_executor/layers/fused_moe/prepare_finalize.py +69 -0
  470. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +421 -0
  471. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +117 -0
  472. vllm/model_executor/layers/fused_moe/utils.py +98 -0
  473. vllm/model_executor/layers/layernorm.py +288 -0
  474. vllm/model_executor/layers/lightning_attn.py +652 -0
  475. vllm/model_executor/layers/linear.py +1524 -0
  476. vllm/model_executor/layers/logits_processor.py +197 -0
  477. vllm/model_executor/layers/mamba/__init__.py +0 -0
  478. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  479. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  480. vllm/model_executor/layers/mamba/mamba_mixer2.py +616 -0
  481. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  482. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  483. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  484. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  485. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  486. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  487. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  488. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  489. vllm/model_executor/layers/pooler.py +350 -0
  490. vllm/model_executor/layers/quantization/__init__.py +157 -0
  491. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  492. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  493. vllm/model_executor/layers/quantization/awq.py +194 -0
  494. vllm/model_executor/layers/quantization/awq_marlin.py +519 -0
  495. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  496. vllm/model_executor/layers/quantization/base_config.py +151 -0
  497. vllm/model_executor/layers/quantization/bitblas.py +461 -0
  498. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +668 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1260 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  505. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  506. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +93 -0
  507. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +178 -0
  508. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  509. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  510. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  511. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  512. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  513. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  514. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  515. vllm/model_executor/layers/quantization/experts_int8.py +196 -0
  516. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  517. vllm/model_executor/layers/quantization/fp8.py +906 -0
  518. vllm/model_executor/layers/quantization/gguf.py +565 -0
  519. vllm/model_executor/layers/quantization/gptq.py +278 -0
  520. vllm/model_executor/layers/quantization/gptq_bitblas.py +445 -0
  521. vllm/model_executor/layers/quantization/gptq_marlin.py +648 -0
  522. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  523. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  524. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  525. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  526. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  527. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  528. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  529. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  530. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  531. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +120 -0
  532. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  533. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  534. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  535. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  536. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  537. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  538. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  539. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  540. vllm/model_executor/layers/quantization/marlin.py +261 -0
  541. vllm/model_executor/layers/quantization/modelopt.py +737 -0
  542. vllm/model_executor/layers/quantization/moe_wna16.py +449 -0
  543. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  544. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  545. vllm/model_executor/layers/quantization/qqq.py +275 -0
  546. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  547. vllm/model_executor/layers/quantization/quark/quark.py +441 -0
  548. vllm/model_executor/layers/quantization/quark/quark_moe.py +237 -0
  549. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  550. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  551. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  552. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +146 -0
  553. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  554. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  555. vllm/model_executor/layers/quantization/schema.py +86 -0
  556. vllm/model_executor/layers/quantization/torchao.py +161 -0
  557. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  558. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  559. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  560. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/fp8_utils.py +618 -0
  764. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  765. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  766. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  767. vllm/model_executor/layers/quantization/utils/machete_utils.py +33 -0
  768. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  769. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  770. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  771. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  772. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  773. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  774. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  775. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +104 -0
  776. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  777. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  778. vllm/model_executor/layers/rejection_sampler.py +406 -0
  779. vllm/model_executor/layers/resampler.py +270 -0
  780. vllm/model_executor/layers/rotary_embedding.py +1862 -0
  781. vllm/model_executor/layers/sampler.py +1204 -0
  782. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  783. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  784. vllm/model_executor/layers/utils.py +95 -0
  785. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  786. vllm/model_executor/model_loader/__init__.py +76 -0
  787. vllm/model_executor/model_loader/base_loader.py +43 -0
  788. vllm/model_executor/model_loader/bitsandbytes_loader.py +570 -0
  789. vllm/model_executor/model_loader/default_loader.py +282 -0
  790. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  791. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  792. vllm/model_executor/model_loader/neuron.py +476 -0
  793. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  794. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  795. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  796. vllm/model_executor/model_loader/tensorizer.py +600 -0
  797. vllm/model_executor/model_loader/tensorizer_loader.py +123 -0
  798. vllm/model_executor/model_loader/tpu.py +112 -0
  799. vllm/model_executor/model_loader/utils.py +302 -0
  800. vllm/model_executor/model_loader/weight_utils.py +782 -0
  801. vllm/model_executor/models/__init__.py +28 -0
  802. vllm/model_executor/models/adapters.py +248 -0
  803. vllm/model_executor/models/aimv2.py +246 -0
  804. vllm/model_executor/models/arctic.py +559 -0
  805. vllm/model_executor/models/aria.py +657 -0
  806. vllm/model_executor/models/aya_vision.py +466 -0
  807. vllm/model_executor/models/baichuan.py +474 -0
  808. vllm/model_executor/models/bamba.py +543 -0
  809. vllm/model_executor/models/bart.py +938 -0
  810. vllm/model_executor/models/bert.py +523 -0
  811. vllm/model_executor/models/bert_with_rope.py +769 -0
  812. vllm/model_executor/models/blip.py +339 -0
  813. vllm/model_executor/models/blip2.py +718 -0
  814. vllm/model_executor/models/bloom.py +373 -0
  815. vllm/model_executor/models/chameleon.py +1136 -0
  816. vllm/model_executor/models/chatglm.py +478 -0
  817. vllm/model_executor/models/clip.py +407 -0
  818. vllm/model_executor/models/commandr.py +472 -0
  819. vllm/model_executor/models/constant_size_cache.py +137 -0
  820. vllm/model_executor/models/dbrx.py +472 -0
  821. vllm/model_executor/models/deepseek.py +486 -0
  822. vllm/model_executor/models/deepseek_mtp.py +269 -0
  823. vllm/model_executor/models/deepseek_v2.py +843 -0
  824. vllm/model_executor/models/deepseek_vl2.py +648 -0
  825. vllm/model_executor/models/eagle.py +260 -0
  826. vllm/model_executor/models/exaone.py +551 -0
  827. vllm/model_executor/models/fairseq2_llama.py +154 -0
  828. vllm/model_executor/models/falcon.py +510 -0
  829. vllm/model_executor/models/falcon_h1.py +685 -0
  830. vllm/model_executor/models/florence2.py +1103 -0
  831. vllm/model_executor/models/fuyu.py +389 -0
  832. vllm/model_executor/models/gemma.py +425 -0
  833. vllm/model_executor/models/gemma2.py +425 -0
  834. vllm/model_executor/models/gemma3.py +533 -0
  835. vllm/model_executor/models/gemma3_mm.py +709 -0
  836. vllm/model_executor/models/glm.py +23 -0
  837. vllm/model_executor/models/glm4.py +305 -0
  838. vllm/model_executor/models/glm4v.py +648 -0
  839. vllm/model_executor/models/gpt2.py +328 -0
  840. vllm/model_executor/models/gpt_bigcode.py +335 -0
  841. vllm/model_executor/models/gpt_j.py +339 -0
  842. vllm/model_executor/models/gpt_neox.py +332 -0
  843. vllm/model_executor/models/granite.py +493 -0
  844. vllm/model_executor/models/granite_speech.py +779 -0
  845. vllm/model_executor/models/granitemoe.py +437 -0
  846. vllm/model_executor/models/granitemoehybrid.py +586 -0
  847. vllm/model_executor/models/granitemoeshared.py +341 -0
  848. vllm/model_executor/models/gritlm.py +224 -0
  849. vllm/model_executor/models/grok1.py +546 -0
  850. vllm/model_executor/models/h2ovl.py +546 -0
  851. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  852. vllm/model_executor/models/idefics3.py +776 -0
  853. vllm/model_executor/models/interfaces.py +572 -0
  854. vllm/model_executor/models/interfaces_base.py +164 -0
  855. vllm/model_executor/models/intern_vit.py +480 -0
  856. vllm/model_executor/models/internlm2.py +455 -0
  857. vllm/model_executor/models/internlm2_ve.py +147 -0
  858. vllm/model_executor/models/internvl.py +1418 -0
  859. vllm/model_executor/models/jais.py +373 -0
  860. vllm/model_executor/models/jamba.py +592 -0
  861. vllm/model_executor/models/kimi_vl.py +577 -0
  862. vllm/model_executor/models/llama.py +644 -0
  863. vllm/model_executor/models/llama4.py +532 -0
  864. vllm/model_executor/models/llama_eagle.py +165 -0
  865. vllm/model_executor/models/llama_eagle3.py +263 -0
  866. vllm/model_executor/models/llava.py +866 -0
  867. vllm/model_executor/models/llava_next.py +586 -0
  868. vllm/model_executor/models/llava_next_video.py +471 -0
  869. vllm/model_executor/models/llava_onevision.py +956 -0
  870. vllm/model_executor/models/mamba.py +273 -0
  871. vllm/model_executor/models/mamba2.py +308 -0
  872. vllm/model_executor/models/mamba_cache.py +76 -0
  873. vllm/model_executor/models/medusa.py +219 -0
  874. vllm/model_executor/models/mimo.py +192 -0
  875. vllm/model_executor/models/mimo_mtp.py +285 -0
  876. vllm/model_executor/models/minicpm.py +592 -0
  877. vllm/model_executor/models/minicpm3.py +230 -0
  878. vllm/model_executor/models/minicpm_eagle.py +391 -0
  879. vllm/model_executor/models/minicpmo.py +759 -0
  880. vllm/model_executor/models/minicpmv.py +1287 -0
  881. vllm/model_executor/models/minimax_cache.py +36 -0
  882. vllm/model_executor/models/minimax_text_01.py +1301 -0
  883. vllm/model_executor/models/minimax_vl_01.py +364 -0
  884. vllm/model_executor/models/mistral3.py +604 -0
  885. vllm/model_executor/models/mixtral.py +488 -0
  886. vllm/model_executor/models/mixtral_quant.py +453 -0
  887. vllm/model_executor/models/mllama.py +1624 -0
  888. vllm/model_executor/models/mllama4.py +938 -0
  889. vllm/model_executor/models/mlp_speculator.py +206 -0
  890. vllm/model_executor/models/modernbert.py +331 -0
  891. vllm/model_executor/models/module_mapping.py +72 -0
  892. vllm/model_executor/models/molmo.py +1568 -0
  893. vllm/model_executor/models/moonvit.py +630 -0
  894. vllm/model_executor/models/mpt.py +331 -0
  895. vllm/model_executor/models/nemotron.py +508 -0
  896. vllm/model_executor/models/nemotron_h.py +573 -0
  897. vllm/model_executor/models/nemotron_nas.py +484 -0
  898. vllm/model_executor/models/nvlm_d.py +216 -0
  899. vllm/model_executor/models/olmo.py +389 -0
  900. vllm/model_executor/models/olmo2.py +414 -0
  901. vllm/model_executor/models/olmoe.py +468 -0
  902. vllm/model_executor/models/opt.py +412 -0
  903. vllm/model_executor/models/orion.py +349 -0
  904. vllm/model_executor/models/ovis.py +567 -0
  905. vllm/model_executor/models/paligemma.py +398 -0
  906. vllm/model_executor/models/persimmon.py +344 -0
  907. vllm/model_executor/models/phi.py +356 -0
  908. vllm/model_executor/models/phi3.py +19 -0
  909. vllm/model_executor/models/phi3_small.py +465 -0
  910. vllm/model_executor/models/phi3v.py +723 -0
  911. vllm/model_executor/models/phi4mm.py +1246 -0
  912. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  913. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  914. vllm/model_executor/models/phimoe.py +665 -0
  915. vllm/model_executor/models/pixtral.py +1316 -0
  916. vllm/model_executor/models/plamo2.py +738 -0
  917. vllm/model_executor/models/prithvi_geospatial_mae.py +232 -0
  918. vllm/model_executor/models/qwen.py +362 -0
  919. vllm/model_executor/models/qwen2.py +497 -0
  920. vllm/model_executor/models/qwen2_5_omni_thinker.py +904 -0
  921. vllm/model_executor/models/qwen2_5_vl.py +1166 -0
  922. vllm/model_executor/models/qwen2_audio.py +410 -0
  923. vllm/model_executor/models/qwen2_moe.py +540 -0
  924. vllm/model_executor/models/qwen2_rm.py +132 -0
  925. vllm/model_executor/models/qwen2_vl.py +1405 -0
  926. vllm/model_executor/models/qwen3.py +321 -0
  927. vllm/model_executor/models/qwen3_moe.py +535 -0
  928. vllm/model_executor/models/qwen_vl.py +785 -0
  929. vllm/model_executor/models/registry.py +622 -0
  930. vllm/model_executor/models/roberta.py +276 -0
  931. vllm/model_executor/models/siglip.py +524 -0
  932. vllm/model_executor/models/skyworkr1v.py +951 -0
  933. vllm/model_executor/models/smolvlm.py +52 -0
  934. vllm/model_executor/models/solar.py +506 -0
  935. vllm/model_executor/models/stablelm.py +343 -0
  936. vllm/model_executor/models/starcoder2.py +356 -0
  937. vllm/model_executor/models/tarsier.py +643 -0
  938. vllm/model_executor/models/telechat2.py +140 -0
  939. vllm/model_executor/models/teleflm.py +79 -0
  940. vllm/model_executor/models/transformers.py +508 -0
  941. vllm/model_executor/models/ultravox.py +656 -0
  942. vllm/model_executor/models/utils.py +731 -0
  943. vllm/model_executor/models/vision.py +147 -0
  944. vllm/model_executor/models/whisper.py +747 -0
  945. vllm/model_executor/models/zamba2.py +1009 -0
  946. vllm/model_executor/parameter.py +459 -0
  947. vllm/model_executor/pooling_metadata.py +72 -0
  948. vllm/model_executor/sampling_metadata.py +597 -0
  949. vllm/model_executor/utils.py +77 -0
  950. vllm/multimodal/__init__.py +33 -0
  951. vllm/multimodal/audio.py +106 -0
  952. vllm/multimodal/base.py +219 -0
  953. vllm/multimodal/hasher.py +118 -0
  954. vllm/multimodal/image.py +97 -0
  955. vllm/multimodal/inputs.py +876 -0
  956. vllm/multimodal/parse.py +461 -0
  957. vllm/multimodal/processing.py +1895 -0
  958. vllm/multimodal/profiling.py +258 -0
  959. vllm/multimodal/registry.py +331 -0
  960. vllm/multimodal/utils.py +436 -0
  961. vllm/multimodal/video.py +198 -0
  962. vllm/outputs.py +512 -0
  963. vllm/platforms/__init__.py +291 -0
  964. vllm/platforms/cpu.py +266 -0
  965. vllm/platforms/cuda.py +526 -0
  966. vllm/platforms/hpu.py +106 -0
  967. vllm/platforms/interface.py +538 -0
  968. vllm/platforms/neuron.py +150 -0
  969. vllm/platforms/rocm.py +435 -0
  970. vllm/platforms/tpu.py +216 -0
  971. vllm/platforms/xpu.py +156 -0
  972. vllm/plugins/__init__.py +94 -0
  973. vllm/plugins/lora_resolvers/README.md +15 -0
  974. vllm/plugins/lora_resolvers/__init__.py +0 -0
  975. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  976. vllm/pooling_params.py +54 -0
  977. vllm/profiler/__init__.py +0 -0
  978. vllm/profiler/layerwise_profile.py +375 -0
  979. vllm/profiler/utils.py +148 -0
  980. vllm/prompt_adapter/__init__.py +0 -0
  981. vllm/prompt_adapter/layers.py +83 -0
  982. vllm/prompt_adapter/models.py +358 -0
  983. vllm/prompt_adapter/request.py +37 -0
  984. vllm/prompt_adapter/utils.py +98 -0
  985. vllm/prompt_adapter/worker_manager.py +179 -0
  986. vllm/py.typed +2 -0
  987. vllm/reasoning/__init__.py +15 -0
  988. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  989. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  990. vllm/reasoning/granite_reasoning_parser.py +363 -0
  991. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  992. vllm/sampling_params.py +602 -0
  993. vllm/scalar_type.py +347 -0
  994. vllm/scripts.py +15 -0
  995. vllm/sequence.py +1568 -0
  996. vllm/spec_decode/__init__.py +0 -0
  997. vllm/spec_decode/batch_expansion.py +506 -0
  998. vllm/spec_decode/draft_model_runner.py +349 -0
  999. vllm/spec_decode/interfaces.py +99 -0
  1000. vllm/spec_decode/medusa_worker.py +138 -0
  1001. vllm/spec_decode/metrics.py +213 -0
  1002. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1003. vllm/spec_decode/mqa_scorer.py +160 -0
  1004. vllm/spec_decode/multi_step_worker.py +423 -0
  1005. vllm/spec_decode/ngram_worker.py +196 -0
  1006. vllm/spec_decode/proposer_worker_base.py +59 -0
  1007. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1008. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1009. vllm/spec_decode/target_model_runner.py +45 -0
  1010. vllm/spec_decode/top1_proposer.py +275 -0
  1011. vllm/spec_decode/util.py +277 -0
  1012. vllm/test_utils.py +130 -0
  1013. vllm/third_party/__init__.py +0 -0
  1014. vllm/third_party/pynvml.py +6140 -0
  1015. vllm/tracing.py +131 -0
  1016. vllm/transformers_utils/__init__.py +24 -0
  1017. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1018. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1019. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1020. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1021. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1022. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1023. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1024. vllm/transformers_utils/config.py +887 -0
  1025. vllm/transformers_utils/configs/__init__.py +61 -0
  1026. vllm/transformers_utils/configs/arctic.py +207 -0
  1027. vllm/transformers_utils/configs/chatglm.py +72 -0
  1028. vllm/transformers_utils/configs/cohere2.py +195 -0
  1029. vllm/transformers_utils/configs/dbrx.py +280 -0
  1030. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1031. vllm/transformers_utils/configs/eagle.py +85 -0
  1032. vllm/transformers_utils/configs/exaone.py +190 -0
  1033. vllm/transformers_utils/configs/falcon.py +90 -0
  1034. vllm/transformers_utils/configs/h2ovl.py +16 -0
  1035. vllm/transformers_utils/configs/internvl.py +54 -0
  1036. vllm/transformers_utils/configs/jais.py +238 -0
  1037. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1038. vllm/transformers_utils/configs/medusa.py +63 -0
  1039. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1040. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1041. vllm/transformers_utils/configs/mllama.py +31 -0
  1042. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1043. vllm/transformers_utils/configs/moonvit.py +33 -0
  1044. vllm/transformers_utils/configs/mpt.py +180 -0
  1045. vllm/transformers_utils/configs/nemotron.py +205 -0
  1046. vllm/transformers_utils/configs/nemotron_h.py +258 -0
  1047. vllm/transformers_utils/configs/nvlm_d.py +15 -0
  1048. vllm/transformers_utils/configs/ovis.py +184 -0
  1049. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1050. vllm/transformers_utils/configs/solar.py +247 -0
  1051. vllm/transformers_utils/configs/telechat2.py +64 -0
  1052. vllm/transformers_utils/configs/ultravox.py +108 -0
  1053. vllm/transformers_utils/detokenizer.py +168 -0
  1054. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1055. vllm/transformers_utils/processor.py +221 -0
  1056. vllm/transformers_utils/processors/__init__.py +8 -0
  1057. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1058. vllm/transformers_utils/processors/ovis.py +420 -0
  1059. vllm/transformers_utils/s3_utils.py +162 -0
  1060. vllm/transformers_utils/tokenizer.py +302 -0
  1061. vllm/transformers_utils/tokenizer_base.py +149 -0
  1062. vllm/transformers_utils/tokenizer_group.py +120 -0
  1063. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1064. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1065. vllm/transformers_utils/utils.py +99 -0
  1066. vllm/triton_utils/__init__.py +14 -0
  1067. vllm/triton_utils/importing.py +50 -0
  1068. vllm/usage/__init__.py +0 -0
  1069. vllm/usage/usage_lib.py +256 -0
  1070. vllm/utils.py +2910 -0
  1071. vllm/v1/__init__.py +0 -0
  1072. vllm/v1/attention/__init__.py +0 -0
  1073. vllm/v1/attention/backends/__init__.py +0 -0
  1074. vllm/v1/attention/backends/cpu_attn.py +163 -0
  1075. vllm/v1/attention/backends/flash_attn.py +869 -0
  1076. vllm/v1/attention/backends/flashinfer.py +651 -0
  1077. vllm/v1/attention/backends/flex_attention.py +477 -0
  1078. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1079. vllm/v1/attention/backends/mla/common.py +931 -0
  1080. vllm/v1/attention/backends/mla/cutlass_mla.py +97 -0
  1081. vllm/v1/attention/backends/mla/flashmla.py +152 -0
  1082. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +220 -0
  1083. vllm/v1/attention/backends/mla/triton_mla.py +120 -0
  1084. vllm/v1/attention/backends/pallas.py +240 -0
  1085. vllm/v1/attention/backends/triton_attn.py +285 -0
  1086. vllm/v1/attention/backends/utils.py +52 -0
  1087. vllm/v1/core/__init__.py +0 -0
  1088. vllm/v1/core/block_pool.py +349 -0
  1089. vllm/v1/core/encoder_cache_manager.py +150 -0
  1090. vllm/v1/core/kv_cache_coordinator.py +363 -0
  1091. vllm/v1/core/kv_cache_manager.py +392 -0
  1092. vllm/v1/core/kv_cache_utils.py +996 -0
  1093. vllm/v1/core/sched/__init__.py +0 -0
  1094. vllm/v1/core/sched/interface.py +150 -0
  1095. vllm/v1/core/sched/output.py +154 -0
  1096. vllm/v1/core/sched/scheduler.py +1044 -0
  1097. vllm/v1/core/sched/utils.py +23 -0
  1098. vllm/v1/core/single_type_kv_cache_manager.py +403 -0
  1099. vllm/v1/engine/__init__.py +173 -0
  1100. vllm/v1/engine/async_llm.py +558 -0
  1101. vllm/v1/engine/coordinator.py +253 -0
  1102. vllm/v1/engine/core.py +961 -0
  1103. vllm/v1/engine/core_client.py +1129 -0
  1104. vllm/v1/engine/detokenizer.py +261 -0
  1105. vllm/v1/engine/exceptions.py +17 -0
  1106. vllm/v1/engine/llm_engine.py +317 -0
  1107. vllm/v1/engine/logprobs.py +199 -0
  1108. vllm/v1/engine/mm_input_cache.py +91 -0
  1109. vllm/v1/engine/output_processor.py +428 -0
  1110. vllm/v1/engine/parallel_sampling.py +133 -0
  1111. vllm/v1/engine/processor.py +407 -0
  1112. vllm/v1/executor/__init__.py +0 -0
  1113. vllm/v1/executor/abstract.py +113 -0
  1114. vllm/v1/executor/multiproc_executor.py +537 -0
  1115. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1116. vllm/v1/kv_cache_interface.py +194 -0
  1117. vllm/v1/metrics/__init__.py +0 -0
  1118. vllm/v1/metrics/loggers.py +523 -0
  1119. vllm/v1/metrics/prometheus.py +82 -0
  1120. vllm/v1/metrics/ray_wrappers.py +131 -0
  1121. vllm/v1/metrics/reader.py +246 -0
  1122. vllm/v1/metrics/stats.py +239 -0
  1123. vllm/v1/outputs.py +116 -0
  1124. vllm/v1/request.py +193 -0
  1125. vllm/v1/sample/__init__.py +0 -0
  1126. vllm/v1/sample/metadata.py +44 -0
  1127. vllm/v1/sample/ops/__init__.py +0 -0
  1128. vllm/v1/sample/ops/bad_words.py +39 -0
  1129. vllm/v1/sample/ops/penalties.py +59 -0
  1130. vllm/v1/sample/ops/topk_topp_sampler.py +293 -0
  1131. vllm/v1/sample/rejection_sampler.py +631 -0
  1132. vllm/v1/sample/sampler.py +286 -0
  1133. vllm/v1/sample/tpu/__init__.py +0 -0
  1134. vllm/v1/sample/tpu/metadata.py +124 -0
  1135. vllm/v1/sample/tpu/sampler.py +145 -0
  1136. vllm/v1/serial_utils.py +315 -0
  1137. vllm/v1/spec_decode/__init__.py +0 -0
  1138. vllm/v1/spec_decode/eagle.py +432 -0
  1139. vllm/v1/spec_decode/medusa.py +62 -0
  1140. vllm/v1/spec_decode/metadata.py +62 -0
  1141. vllm/v1/spec_decode/metrics.py +178 -0
  1142. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1143. vllm/v1/spec_decode/utils.py +46 -0
  1144. vllm/v1/structured_output/__init__.py +222 -0
  1145. vllm/v1/structured_output/backend_guidance.py +245 -0
  1146. vllm/v1/structured_output/backend_types.py +134 -0
  1147. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1148. vllm/v1/structured_output/request.py +86 -0
  1149. vllm/v1/structured_output/utils.py +175 -0
  1150. vllm/v1/utils.py +743 -0
  1151. vllm/v1/worker/__init__.py +0 -0
  1152. vllm/v1/worker/block_table.py +142 -0
  1153. vllm/v1/worker/cpu_model_runner.py +86 -0
  1154. vllm/v1/worker/cpu_worker.py +152 -0
  1155. vllm/v1/worker/gpu_input_batch.py +681 -0
  1156. vllm/v1/worker/gpu_model_runner.py +2320 -0
  1157. vllm/v1/worker/gpu_worker.py +393 -0
  1158. vllm/v1/worker/lora_model_runner_mixin.py +173 -0
  1159. vllm/v1/worker/tpu_model_runner.py +1673 -0
  1160. vllm/v1/worker/tpu_worker.py +299 -0
  1161. vllm/v1/worker/utils.py +111 -0
  1162. vllm/v1/worker/worker_base.py +65 -0
  1163. vllm/version.py +41 -0
  1164. vllm/vllm_flash_attn/.gitkeep +0 -0
  1165. vllm/worker/__init__.py +0 -0
  1166. vllm/worker/cache_engine.py +145 -0
  1167. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1168. vllm/worker/cpu_model_runner.py +671 -0
  1169. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1170. vllm/worker/cpu_worker.py +450 -0
  1171. vllm/worker/enc_dec_model_runner.py +555 -0
  1172. vllm/worker/hpu_model_runner.py +2320 -0
  1173. vllm/worker/hpu_worker.py +484 -0
  1174. vllm/worker/model_runner.py +2178 -0
  1175. vllm/worker/model_runner_base.py +282 -0
  1176. vllm/worker/multi_step_hpu_worker.py +123 -0
  1177. vllm/worker/multi_step_model_runner.py +911 -0
  1178. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1179. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1180. vllm/worker/multi_step_tpu_worker.py +108 -0
  1181. vllm/worker/multi_step_worker.py +197 -0
  1182. vllm/worker/neuron_model_runner.py +460 -0
  1183. vllm/worker/neuron_worker.py +193 -0
  1184. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1185. vllm/worker/pooling_model_runner.py +211 -0
  1186. vllm/worker/tpu_model_runner.py +909 -0
  1187. vllm/worker/tpu_worker.py +337 -0
  1188. vllm/worker/utils.py +53 -0
  1189. vllm/worker/worker.py +577 -0
  1190. vllm/worker/worker_base.py +646 -0
  1191. vllm/worker/xpu_model_runner.py +606 -0
  1192. vllm/worker/xpu_worker.py +186 -0
  1193. vllm_cpu_amxbf16-0.9.1.dist-info/METADATA +305 -0
  1194. vllm_cpu_amxbf16-0.9.1.dist-info/RECORD +1197 -0
  1195. vllm_cpu_amxbf16-0.9.1.dist-info/WHEEL +5 -0
  1196. vllm_cpu_amxbf16-0.9.1.dist-info/entry_points.txt +5 -0
  1197. vllm_cpu_amxbf16-0.9.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1535 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ import importlib
+ from abc import abstractmethod
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Callable, Optional, Union
+
+ import torch
+ import torch.nn.functional as F
+ from compressed_tensors.quantization import (QuantizationArgs,
+                                              QuantizationStrategy,
+                                              QuantizationType)
+ from torch.nn.parameter import UninitializedParameter
+
+ import vllm.envs as envs
+ from vllm.config import ParallelConfig, get_current_vllm_config
+ from vllm.distributed import (get_dp_group, get_ep_group,
+                               get_tensor_model_parallel_rank,
+                               get_tensor_model_parallel_world_size,
+                               tensor_model_parallel_all_reduce)
+ from vllm.forward_context import ForwardContext, get_forward_context
+ from vllm.logger import init_logger
+ from vllm.model_executor.custom_op import CustomOp
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+     is_rocm_aiter_moe_enabled)
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig, QuantizeMethodBase)
+ from vllm.model_executor.utils import set_weight_attrs
+ from vllm.platforms import current_platform
+ from vllm.platforms.interface import CpuArchEnum
+ from vllm.utils import direct_register_custom_op
+
+ has_pplx = importlib.util.find_spec("pplx_kernels") is not None
+ has_deepep = importlib.util.find_spec("deep_ep") is not None
+
+ if current_platform.is_cuda_alike():
+     from .fused_batched_moe import BatchedTritonExperts
+     from .fused_moe import TritonExperts, fused_experts
+     from .modular_kernel import (FusedMoEModularKernel,
+                                  FusedMoEPermuteExpertsUnpermute,
+                                  FusedMoEPrepareAndFinalize)
+     if has_pplx:
+         from .pplx_prepare_finalize import PplxPrepareAndFinalize
+     if has_deepep:
+         from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
+         from .deepep_ll_prepare_finalize import DeepEPLLPrepareAndFinalize
+ else:
+     fused_experts = None  # type: ignore
+     FusedMoEPermuteExpertsUnpermute = None  # type: ignore
+     FusedMoEPrepareAndFinalize = None  # type: ignore
+ if is_rocm_aiter_moe_enabled():
+     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
+         rocm_aiter_grouped_topk as grouped_topk)
+ else:
+     from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
+ if current_platform.is_tpu():
+     from .moe_pallas import fused_moe as fused_moe_pallas
+ else:
+     fused_moe_pallas = None  # type: ignore
+ logger = init_logger(__name__)
+
+ # Note: this limit is somewhat arbitrary and might be changed later.
+ # The size of the activations will be E x MOE_DP_CHUNK_SIZE x hidden_dim.
+ MOE_DP_CHUNK_SIZE = 256
+
+
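For a rough sense of that buffer's size (illustrative numbers, not taken from this package): with 8 local experts, the 256-token chunk and a hidden dim of 4096 in bf16, the activations come to 16 MiB per rank.

    # Hypothetical sizing check for the E x MOE_DP_CHUNK_SIZE x hidden_dim buffer.
    E, hidden_dim, bf16_bytes = 8, 4096, 2           # assumed example values
    buf_bytes = E * 256 * hidden_dim * bf16_bytes    # MOE_DP_CHUNK_SIZE = 256
    assert buf_bytes == 16 * 1024 * 1024             # 16 MiB per rank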
+ @dataclass
+ class FusedMoEParallelConfig:
+     tp_size: int
+     dp_size: int
+     ep_size: int
+     tp_rank: int
+     dp_rank: int
+     ep_rank: int
+
+     use_ep: bool  # whether to use EP or not
+
+     @property
+     def use_all2all_kernels(self):
+         return self.dp_size > 1 and self.use_ep
+
+     @property
+     def use_pplx_kernels(self):
+         return (self.use_all2all_kernels
+                 and envs.VLLM_ALL2ALL_BACKEND == "pplx")
+
+     @property
+     def use_deepep_ht_kernels(self):
+         return (self.use_all2all_kernels
+                 and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput")
+
+     @property
+     def use_deepep_ll_kernels(self):
+         return (self.use_all2all_kernels
+                 and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
+
+     @staticmethod
+     def make(tp_size_: int, dp_size_: int,
+              vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
+         """
+         Determine the MoE parallel configuration. Based on the input tp_size_,
+         dp_size_ and vLLM's parallel config, determine what levels of
+         parallelism to use in the fused MoE layer.
+
+         Args:
+             tp_size_ (int): tp_size passed into the FusedMoE constructor.
+             dp_size_ (int): dp_size passed into the FusedMoE constructor.
+             vllm_parallel_config (ParallelConfig): vLLM's parallel config
+                 object.
+
+         Examples:
+             When there is no parallelism requested, i.e. tp_size_ = dp_size_ = 1,
+             we simply return the sizes unaltered and the ranks set to 0.
+
+             Expert parallelism is considered only when either dp_size_ or
+             tp_size_ is non-trivial.
+
+             When TP = 2, DP = 1 and EP = False, the configuration on different
+             devices is,
+             - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
+                 legend : {size, rank}
+             - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
+             - Comment : Tensors are sharded across 2 devices.
+
+             When TP = 1, DP = 2 and EP = False, the configuration on different
+             devices is,
+             - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
+             - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
+             - Comment: There are 2 engine instances and the tensors are sharded
+                 across 2 devices.
+
+             When TP = 2, DP = 2 and EP = False, the configuration on different
+             devices is,
+             - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
+             - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
+             - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
+             - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
+             - Comment: There are 2 engine instances and the tensors are sharded
+                 across 4 devices.
+
+             When TP = 2, DP = 1 and EP = True, the configuration on different
+             devices is,
+             - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
+             - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
+             - Comment: The experts are split between the 2 devices.
+
+             When TP = 1, DP = 2 and EP = True, the configuration on different
+             devices is,
+             - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
+             - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
+             - Comment: There are 2 engine instances and the experts are split
+                 between the 2 devices.
+
+             When TP = 2, DP = 2 and EP = True, the configuration on different
+             devices is,
+             - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
+             - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
+             - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
+             - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
+             - Comment: There are 2 engine instances and the experts are split
+                 between the 4 devices.
+         """
+
+         def flatten_tp_across_dp(dp_rank: int):
+             tp_rank = 0 if tp_size_ == 1 else get_tensor_model_parallel_rank()
+             # There are actually dp_size_ * tp_size_ devices. Update tp_size
+             # and tp_rank so we shard across all devices.
+             tp_size = dp_size_ * tp_size_
+             tp_rank = dp_rank * tp_size_ + tp_rank
+             return tp_size, tp_rank
+
+         use_ep = (dp_size_ * tp_size_ > 1
+                   and vllm_parallel_config.enable_expert_parallel)
+
+         dp_size = dp_size_
+         dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0
+         tp_size, tp_rank = flatten_tp_across_dp(dp_rank)
+
+         if not use_ep:
+             return FusedMoEParallelConfig(tp_size=tp_size,
+                                           tp_rank=tp_rank,
+                                           dp_size=dp_size,
+                                           dp_rank=dp_rank,
+                                           ep_size=1,
+                                           ep_rank=0,
+                                           use_ep=False)
+         # DP + EP / TP + EP / DP + TP + EP
+         assert use_ep
+         # In EP, each device owns a set of experts fully. There is no tensor
+         # parallelism; update tp_size, tp_rank, ep_size and ep_rank to
+         # reflect that.
+         ep_size = tp_size
+         ep_rank = tp_rank
+         return FusedMoEParallelConfig(tp_size=1,
+                                       tp_rank=0,
+                                       dp_size=dp_size,
+                                       dp_rank=dp_rank,
+                                       ep_size=ep_size,
+                                       ep_rank=ep_rank,
+                                       use_ep=True)
+
+
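To make the docstring's last case concrete, a minimal sketch of calling make() (this assumes vLLM's TP/DP process groups are already initialized, since make() reads them via get_tensor_model_parallel_rank() and get_dp_group(); enable_expert_parallel is the ParallelConfig field checked in the body above):

    # Illustrative only: requires initialized distributed groups.
    cfg = FusedMoEParallelConfig.make(
        tp_size_=2, dp_size_=2,
        vllm_parallel_config=ParallelConfig(enable_expert_parallel=True))
    # On device 2 (tp_rank 0 of DP replica 1) this should yield:
    #   cfg.tp_size == 1, cfg.dp_size == 2, cfg.ep_size == 4, cfg.ep_rank == 2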
+ # Adapted from pplx-kernels tests/all_to_all_utils.py
+ @dataclass
+ class MoEConfig:
+     num_experts: int
+     experts_per_token: int
+     hidden_dim: int
+
+     num_local_experts: int
+     moe_parallel_config: FusedMoEParallelConfig
+
+     in_dtype: torch.dtype  # The activation type.
+     quant_dtype: Optional[torch.dtype] = None
+
+     # TODO: add more quantization params, blocked, per-token, etc.
+     block_size: int = 128
+
+     max_num_tokens: int = MOE_DP_CHUNK_SIZE
+
+     @property
+     def tp_size(self):
+         return self.moe_parallel_config.tp_size
+
+     @property
+     def dp_size(self):
+         return self.moe_parallel_config.dp_size
+
+     @property
+     def ep_size(self):
+         return self.moe_parallel_config.ep_size
+
+     @property
+     def tp_rank(self):
+         return self.moe_parallel_config.tp_rank
+
+     @property
+     def dp_rank(self):
+         return self.moe_parallel_config.dp_rank
+
+     @property
+     def ep_rank(self):
+         return self.moe_parallel_config.ep_rank
+
+     @property
+     def use_ep(self):
+         return self.moe_parallel_config.use_ep
+
+     @property
+     def use_pplx_kernels(self):
+         return self.moe_parallel_config.use_pplx_kernels
+
+     @property
+     def use_deepep_ht_kernels(self):
+         return self.moe_parallel_config.use_deepep_ht_kernels
+
+     @property
+     def use_deepep_ll_kernels(self):
+         return self.moe_parallel_config.use_deepep_ll_kernels
+
+
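A minimal sketch of wiring the two dataclasses together for a single-device run (the model dims here are illustrative, not from this package):

    import torch

    parallel = FusedMoEParallelConfig(tp_size=1, dp_size=1, ep_size=1,
                                      tp_rank=0, dp_rank=0, ep_rank=0,
                                      use_ep=False)
    moe = MoEConfig(num_experts=8, experts_per_token=2, hidden_dim=4096,
                    num_local_experts=8, moe_parallel_config=parallel,
                    in_dtype=torch.bfloat16)
    assert moe.ep_size == 1 and not moe.use_pplx_kernels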
+ class FusedMoeWeightScaleSupported(Enum):
+     TENSOR = "tensor"
+     CHANNEL = "channel"
+     GROUP = "group"
+     BLOCK = "block"
+
+
+ def get_quant_config_input_activations(
+         quant_config: Optional[QuantizationConfig]
+ ) -> Optional[QuantizationArgs]:
+     if (quant_config is not None and hasattr(quant_config, 'target_scheme_map')
+             and "Linear" in quant_config.target_scheme_map and
+             "input_activations" in quant_config.target_scheme_map["Linear"]):
+         return quant_config.target_scheme_map["Linear"].get(
+             "input_activations")
+     else:
+         return None
+
+
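The helper above is a defensive lookup into compressed-tensors' scheme map; a hedged illustration of the nesting it expects (keys per the checks above, values illustrative):

    # Hypothetical shape of quant_config.target_scheme_map:
    #     {"Linear": {"weights": QuantizationArgs(...),
    #                 "input_activations": QuantizationArgs(
    #                     strategy=QuantizationStrategy.TOKEN, ...)}}
    # The helper returns the "input_activations" entry, or None when the
    # config, the "Linear" target, or that entry is absent.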
+ class FusedMoEMethodBase(QuantizeMethodBase):
+
+     moe: MoEConfig
+
+     @abstractmethod
+     def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                        hidden_size: int, intermediate_size_per_partition: int,
+                        params_dtype: torch.dtype, **extra_weight_attrs):
+         raise NotImplementedError
+
+     def init_prepare_finalize(self, moe: MoEConfig,
+                               quant_config: Optional[QuantizationConfig]):
+         all2all_manager = get_ep_group().device_communicator.all2all_manager
+         assert all2all_manager is not None
+
+         self.moe = moe
+         quant_dtype = None
+         act_quant_block_size = None
+         from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+         if isinstance(quant_config, Fp8Config):
+             act_quant_block_size = quant_config.weight_block_size
+             quant_dtype = torch.float8_e4m3fn
+
+         prepare_finalize: Optional[Union[PplxPrepareAndFinalize,
+                                          DeepEPHTPrepareAndFinalize,
+                                          DeepEPLLPrepareAndFinalize]] = None
+         if moe.use_pplx_kernels:
+             all_to_all_args = dict(
+                 max_num_tokens=moe.max_num_tokens,
+                 num_experts=moe.num_experts,
+                 experts_per_token=moe.experts_per_token,  # topk
+                 rank=all2all_manager.rank,
+                 world_size=all2all_manager.world_size,
+                 # dp_size actually means tp_size, bug in pplx kernels
+                 dp_size=all2all_manager.tp_group.world_size,
+                 hidden_dim=moe.hidden_dim,
+                 hidden_dim_bytes=moe.hidden_dim * moe.quant_dtype.itemsize,
+                 # For blocked per token: set to
+                 #   ceil_div(hidden_dim, block_size) * sizeof(float32)
+                 # For per-token: set to sizeof(float32)
+                 hidden_dim_scale_bytes=(
+                     0 if moe.quant_dtype.itemsize != 1 else
+                     ((moe.hidden_dim + moe.block_size - 1) // moe.block_size *
+                      torch.float32.itemsize)),
+             )
+
+             # Intranode pplx a2a takes a group name while internode does not.
+             if not all2all_manager.internode:
+                 all_to_all_args[
+                     "group_name"] = all2all_manager.cpu_group.group_name
+
+             handle = all2all_manager.get_handle(all_to_all_args)
+
+             input_activations = get_quant_config_input_activations(
+                 quant_config)
+
+             prepare_finalize = PplxPrepareAndFinalize(
+                 handle,
+                 max_num_tokens=moe.max_num_tokens,
+                 world_size=all2all_manager.world_size,
+                 rank=all2all_manager.rank,
+                 # dp_size actually means tp_size, bug in pplx kernels
+                 dp_size=all2all_manager.tp_group.world_size,
+                 quant_dtype=moe.quant_dtype,
+                 per_act_token=(input_activations.strategy
+                                == QuantizationStrategy.TOKEN
+                                if input_activations is not None else False),
+             )
+         elif moe.use_deepep_ht_kernels:
+             assert moe.dp_size == all2all_manager.dp_world_size
+
+             all_to_all_args = dict()
+             handle = all2all_manager.get_handle(all_to_all_args)
+             prepare_finalize = DeepEPHTPrepareAndFinalize(
+                 handle,
+                 world_size=all2all_manager.world_size,
+                 rank=all2all_manager.rank,
+                 dp_size=all2all_manager.dp_world_size,
+                 rank_expert_offset=all2all_manager.rank *
+                 moe.num_local_experts,
+                 quant_dtype=quant_dtype,
+                 block_shape=act_quant_block_size,
+             )
+
+         elif moe.use_deepep_ll_kernels:
+             assert moe.dp_size == all2all_manager.dp_world_size
+
+             all_to_all_args = dict(
+                 max_num_tokens_per_dp_rank=moe.max_num_tokens,
+                 token_hidden_size=moe.hidden_dim,
+                 num_ep_ranks=all2all_manager.world_size,
+                 num_global_experts=moe.num_experts,
+                 num_local_experts=moe.num_experts //
+                 all2all_manager.world_size)
+             handle = all2all_manager.get_handle(all_to_all_args)
+
+             # Note (varun): Whether to use FP8 dispatch or not needs some
+             # profiling. Turning it off for now.
+             prepare_finalize = DeepEPLLPrepareAndFinalize(
+                 handle,
+                 world_size=all2all_manager.world_size,
+                 dp_size=all2all_manager.dp_world_size,
+                 max_tokens_per_rank=moe.max_num_tokens,
+                 quant_dtype=quant_dtype,
+                 block_shape=act_quant_block_size,
+                 use_fp8_dispatch=False,
+             )
+
+         self.topk_indices_dtype = None
+         if prepare_finalize is not None:
+             self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
+             experts = self.select_gemm_impl(prepare_finalize, moe)
+             self.fused_experts = FusedMoEModularKernel(
+                 prepare_finalize,
+                 experts,
+             )
+
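Unpacking the hidden_dim_scale_bytes arithmetic above (numbers illustrative): fp8 activations have itemsize 1, so with hidden_dim = 7168 and block_size = 128 each token carries ceil(7168 / 128) = 56 float32 block scales, i.e. 224 bytes; for wider activation dtypes the expression is 0.

    import torch

    # Hypothetical check of the per-token scale sizing used for pplx buffers.
    hidden_dim, block_size = 7168, 128                 # assumed example values
    n_scales = (hidden_dim + block_size - 1) // block_size
    assert n_scales * torch.float32.itemsize == 224    # scale bytes per token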
+     def select_gemm_impl(
+             self, prepare_finalize: FusedMoEPrepareAndFinalize,
+             moe: Optional[MoEConfig]) -> FusedMoEPermuteExpertsUnpermute:
+         # Based on the all2all implementation, select the appropriate
+         # gemm implementation.
+         raise NotImplementedError(
+             "Subclass must select appropriate gemm implementation"
+             " based on the prepare_finalize")
+
+     @abstractmethod
+     def apply(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         router_logits: torch.Tensor,
+         top_k: int,
+         renormalize: bool,
+         use_grouped_topk: bool = False,
+         topk_group: Optional[int] = None,
+         num_expert_group: Optional[int] = None,
+         global_num_experts: int = -1,
+         expert_map: Optional[torch.Tensor] = None,
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+         apply_router_weight_on_input: bool = False,
+         activation: str = "silu",
+     ) -> torch.Tensor:
+         raise NotImplementedError
+
+
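The apply() contract above abstracts per-token expert routing. A minimal sketch of the kind of routing its defaults name (softmax scoring with renormalized top-k weights; the shapes and top_k are illustrative, and real backends fuse this into kernels):

    import torch

    # Hypothetical routing sketch for apply()'s (router_logits, top_k) inputs.
    tokens, n_experts, top_k = 4, 8, 2
    router_logits = torch.randn(tokens, n_experts)
    scores = router_logits.softmax(dim=-1)                 # scoring_func="softmax"
    weights, ids = scores.topk(top_k, dim=-1)              # expert choice per token
    weights = weights / weights.sum(dim=-1, keepdim=True)  # renormalize=True
    assert ids.shape == (tokens, top_k)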
431
+ @CustomOp.register("unquantized_fused_moe")
432
+ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
433
+ """MoE method without quantization."""
434
+
435
+ def __init__(self, moe: MoEConfig):
436
+ super().__init__()
437
+ self.fused_experts = fused_experts # type: ignore
438
+ self.topk_indices_dtype = None
439
+ self.moe = moe
440
+
441
+ self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
442
+ if self.rocm_aiter_moe_enabled:
443
+ from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
444
+ self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
445
+ else:
446
+ self.rocm_aiter_fused_experts = None # type: ignore
447
+
448
+ def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize,
449
+ moe: Optional[MoEConfig]):
450
+
451
+ assert self.fused_experts == fused_experts
452
+
453
+ all2all_manager = get_ep_group().device_communicator.all2all_manager
454
+ assert all2all_manager is not None
455
+
456
+ experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
457
+
458
+ use_batched_experts = prepare_finalize.max_num_tokens_per_rank(
459
+ ) is not None
460
+ if use_batched_experts:
461
+ logger.debug("BatchedTritonExperts %s", self.moe)
462
+ assert self.moe.dp_size == all2all_manager.dp_world_size
463
+ experts = BatchedTritonExperts(
464
+ max_num_tokens=self.moe.max_num_tokens,
465
+ world_size=all2all_manager.world_size,
466
+ # dp_size actually means tp_size, bug in pplx kernels
467
+ dp_size=all2all_manager.tp_group.world_size,
468
+ use_fp8_w8a8=False,
469
+ use_int8_w8a8=False,
470
+ use_int8_w8a16=False,
471
+ use_int4_w4a16=False,
472
+ block_shape=None,
473
+ per_channel_quant=False,
474
+ )
475
+ else:
476
+ logger.debug("TritonExperts %s", self.moe)
477
+ experts = TritonExperts(
478
+ use_fp8_w8a8=False,
479
+ use_int8_w8a8=False,
480
+ use_int8_w8a16=False,
481
+ use_int4_w4a16=False,
482
+ block_shape=None,
483
+ per_channel_quant=False,
484
+ )
485
+ return experts
486
+
487
+ def create_weights(self, layer: torch.nn.Module, num_experts: int,
488
+ hidden_size: int, intermediate_size_per_partition: int,
489
+ params_dtype: torch.dtype, **extra_weight_attrs):
490
+ # Fused gate_up_proj (column parallel)
491
+ w13_weight = torch.nn.Parameter(torch.empty(
492
+ num_experts,
493
+ 2 * intermediate_size_per_partition,
494
+ hidden_size,
495
+ dtype=params_dtype),
496
+ requires_grad=False)
497
+ layer.register_parameter("w13_weight", w13_weight)
498
+ set_weight_attrs(w13_weight, extra_weight_attrs)
499
+
500
+ # down_proj (row parallel)
501
+ w2_weight = torch.nn.Parameter(torch.empty(
502
+ num_experts,
503
+ hidden_size,
504
+ intermediate_size_per_partition,
505
+ dtype=params_dtype),
506
+ requires_grad=False)
507
+ layer.register_parameter("w2_weight", w2_weight)
508
+ set_weight_attrs(w2_weight, extra_weight_attrs)
509
+
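+    # Editor's note, a minimal shape sketch for create_weights above (the
+    # sizes are hypothetical, not from the original source): with
+    # num_experts=8, intermediate_size_per_partition=1024 and
+    # hidden_size=4096, the registered parameters are
+    #   w13_weight: [8, 2 * 1024, 4096]  (gate_proj and up_proj fused)
+    #   w2_weight:  [8, 4096, 1024]      (down_proj)
+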
+    def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
+        # Pad the weight tensor. This is an optimization on the ROCm
+        # platform, which benefits from tensors that are spaced sufficiently
+        # far apart in memory.
+        if (envs.VLLM_ROCM_MOE_PADDING and current_platform.is_rocm()
+                and weight.stride(-1) == 1
+                and (weight.stride(-2) * weight.element_size()) % 512 == 0):
+            num_pad = 256 // weight.element_size()
+            weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
+            torch.cuda.empty_cache()
+        return weight
+
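+    # Editor's note, an illustrative sketch of the padding trick above
+    # (shapes are hypothetical): F.pad appends num_pad trailing elements and
+    # the slice [..., :-num_pad] restores the logical shape, so the tensor
+    # keeps its original size while consecutive rows are separated by a
+    # num_pad-element gap in storage. For a bf16 [4096, 4096] weight,
+    # num_pad = 256 // 2 = 128 and stride(-2) grows from 4096 to 4224.
+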
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+
+        # Padding the weight for better performance on ROCm
+        layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
+        layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
+        # Lazy import to avoid importing triton.
+        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+            shuffle_weights)
+
+        if self.rocm_aiter_moe_enabled:
+            shuffled_w13, shuffled_w2 = shuffle_weights(
+                layer.w13_weight.data, layer.w2_weight.data)
+
+            layer.w13_weight.data = shuffled_w13
+            layer.w2_weight.data = shuffled_w2
+
+        if current_platform.is_cpu():
+            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+                import intel_extension_for_pytorch as ipex
+                layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    use_prepack=envs.VLLM_CPU_MOE_PREPACK,
+                )
+            else:
+                raise NotImplementedError("CPU MOE only supports x86 arch.")
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+        return self.forward(
+            x=x,
+            layer=layer,
+            router_logits=router_logits,
+            top_k=top_k,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias,
+            activation=activation,
+            apply_router_weight_on_input=apply_router_weight_on_input)
+
+    def forward_cuda(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias,
+            indices_type=self.topk_indices_dtype)
+
+        if self.rocm_aiter_moe_enabled:
+            assert expert_map is None
+            return self.rocm_aiter_fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input)
+        else:
+            return self.fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+            )
+
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        **kwargs,
+    ):
+        assert activation == "silu", f"{activation} is not supported."
+        assert apply_router_weight_on_input is False
+        return layer.ipex_fusion(
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            custom_routing_function,
+            scoring_func,
+            e_score_correction_bias,
+        )
+
+    def forward_hpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+        assert not use_grouped_topk
+        assert num_expert_group is None
+        assert topk_group is None
+        assert custom_routing_function is None
+        assert layer is not None
+        assert apply_router_weight_on_input is False
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax scoring function is supported for HPU.")
+        if e_score_correction_bias is not None:
+            raise NotImplementedError(
+                "Expert score correction bias is not supported for HPU.")
+        return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight,
+                                   router_logits, top_k)
+
+    def forward_tpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+        assert not use_grouped_topk
+        assert num_expert_group is None
+        assert topk_group is None
+        assert custom_routing_function is None
+        assert apply_router_weight_on_input is False
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax scoring function is supported for TPU.")
+        if e_score_correction_bias is not None:
+            raise NotImplementedError(
+                "Expert score correction bias is not supported for TPU.")
+        assert activation == "silu", f"{activation} is not supported for TPU."
+        return fused_moe_pallas(hidden_states=x,
+                                w1=layer.w13_weight,
+                                w2=layer.w2_weight,
+                                topk=top_k,
+                                gating_output=router_logits,
+                                global_num_experts=global_num_experts,
+                                expert_map=expert_map,
+                                renormalize=renormalize)
+
+    forward_native = forward_tpu if current_platform.is_tpu() else forward_cuda
+
+
+def determine_expert_map(
+        ep_size: int, ep_rank: int,
+        global_num_experts: int) -> tuple[int, Optional[torch.Tensor]]:
+    """
+    Calculates how many experts should be assigned to each rank for EP and
+    creates a mapping from global to local expert index. Experts are
+    distributed evenly across ranks. Any remaining are assigned to the
+    last rank.
+
+    Args:
+        ep_size (int): The size of the expert parallel group.
+        ep_rank (int): The rank of the current process in the expert
+            parallel group.
+        global_num_experts (int): The total number of experts in the model.
+
+    Returns:
+        tuple[int, Optional[torch.Tensor]]: A tuple containing:
+            - local_num_experts (int): The number of experts assigned
+                to the current rank.
+            - expert_map (Optional[torch.Tensor]): A tensor of shape
+                (global_num_experts,) mapping from global to local index.
+                Contains -1 for experts not assigned to the current rank.
+                Returns None if ep_size is 1.
+    """
+    assert ep_size > 0
+    if ep_size == 1:
+        return (global_num_experts, None)
+
+    local_num_experts = global_num_experts // ep_size
+
+    # Create a tensor of size num_experts filled with -1
+    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
+    # Create an expert map for the local experts
+    if ep_rank < (ep_size - 1):
+        # Each non-last rank gets local_num_experts experts.
+        expert_map[ep_rank * local_num_experts:
+                   (ep_rank + 1) * local_num_experts] = \
+            torch.arange(0, local_num_experts, dtype=torch.int32)
+    else:
+        # All remaining experts are assigned to the last rank.
+        local_num_experts = (global_num_experts - ep_rank * local_num_experts)
+
+        expert_map[-local_num_experts:] = \
+            torch.arange(0, local_num_experts, dtype=torch.int32)
+    return (local_num_experts, expert_map)
+
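+# Editor's note, a worked example of determine_expert_map (the values are
+# hypothetical): with ep_size=4 and global_num_experts=10, local_num_experts
+# is 10 // 4 = 2 for ranks 0-2, while rank 3 takes the remainder,
+# 10 - 3 * 2 = 4 experts. For ep_rank=1 the returned map is
+# [-1, -1, 0, 1, -1, -1, -1, -1, -1, -1], i.e. global experts 2 and 3 become
+# local experts 0 and 1 on that rank.
+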
+
+class FusedMoE(torch.nn.Module):
+    """FusedMoE layer for MoE models.
+
+    This layer contains both MergedColumnParallel weights (gate_up_proj /
+    w13) and RowParallelLinear weights (down_proj / w2).
+
+    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
+    copy that naming convention here and handle any remapping in the
+    load_weights function in each model implementation.
+
+    Args:
+        num_experts: Number of experts in the model
+        top_k: Number of experts selected for each token
+        hidden_size: Input hidden state size of the transformer
+        intermediate_size: Intermediate size of the experts
+        params_dtype: Data type for the parameters.
+        reduce_results: Whether to all_reduce the output of the layer
+        renormalize: Whether to renormalize the logits in the fused_moe kernel
+        quant_config: Quantization configuration.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,  # Global number of experts
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = False,
+        renormalize: bool = True,
+        use_grouped_topk: bool = False,
+        num_expert_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        ep_size: Optional[int] = None,
+        dp_size: Optional[int] = None,
+        prefix: str = "",
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ):
+        super().__init__()
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+
+        vllm_config = get_current_vllm_config()
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.global_num_experts = num_experts
+
+        # For smuggling this layer into the fused moe custom op
+        self.use_direct_call = self.dp_size == 1
+        if not self.use_direct_call:
+            compilation_config = vllm_config.compilation_config
+            if prefix in compilation_config.static_forward_context:
+                raise ValueError("Duplicate layer name: {}".format(prefix))
+            compilation_config.static_forward_context[prefix] = self
+            self.layer_name = prefix
+
+        # Determine expert maps
+        if self.use_ep:
+            self.local_num_experts, self.expert_map = determine_expert_map(
+                ep_size=self.ep_size,
+                ep_rank=self.ep_rank,
+                global_num_experts=self.global_num_experts)
+        else:
+            self.local_num_experts, self.expert_map = (self.global_num_experts,
+                                                       None)
+
+        self.top_k = top_k
+
+        assert intermediate_size % self.tp_size == 0
+        self.hidden_size = hidden_size
+        self.intermediate_size_per_partition = intermediate_size // self.tp_size
+        self.reduce_results = reduce_results
+        self.renormalize = renormalize
+        self.use_grouped_topk = use_grouped_topk
+        if self.use_grouped_topk:
+            assert num_expert_group is not None and topk_group is not None
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.custom_routing_function = custom_routing_function
+        self.scoring_func = scoring_func
+        self.e_score_correction_bias = e_score_correction_bias
+        self.apply_router_weight_on_input = apply_router_weight_on_input
+        self.activation = activation
+
+        if self.scoring_func != "softmax" and not self.use_grouped_topk:
+            raise ValueError("Only softmax scoring function is supported for "
+                             "non-grouped topk.")
+        if current_platform.is_hpu():
+            from vllm_hpu_extension.ops import DynamicFusedMOE
+            self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)
+
+        # Only support float8 for now.
+        quant_dtype = params_dtype
+        if quant_config is not None:
+            input_activations = get_quant_config_input_activations(
+                quant_config)
+            if (input_activations is not None
+                    and input_activations.num_bits == 8
+                    and input_activations.type == QuantizationType.FLOAT):
+                quant_dtype = torch.float8_e4m3fn
+
+        moe = MoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=params_dtype,
+            quant_dtype=quant_dtype,
+            max_num_tokens=MOE_DP_CHUNK_SIZE,
+        )
+        self.moe_config = moe
+        self.quant_config = quant_config
+
+        # Note: get_quant_method will look at the layer's local_num_experts
+        # for heuristic purposes, so it must be initialized first.
+        quant_method: Optional[QuantizeMethodBase] = None
+        quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None
+                        else quant_config.get_quant_method(self, prefix))
+
+        assert quant_method is not None
+        assert isinstance(quant_method, FusedMoEMethodBase)
+        self.quant_method = quant_method
+
+        moe_quant_params = {
+            "num_experts": self.local_num_experts,
+            "hidden_size": hidden_size,
+            "intermediate_size_per_partition":
+            self.intermediate_size_per_partition,
+            "params_dtype": params_dtype,
+            "weight_loader": self.weight_loader,
+        }
+        # need full intermediate size pre-sharding for WNA16 act order
+        if (self.quant_method.__class__.__name__
+                in ("GPTQMarlinMoEMethod",
+                    "CompressedTensorsWNA16MarlinMoEMethod",
+                    "CompressedTensorsWNA16MoEMethod")):
+            moe_quant_params["intermediate_size_full"] = intermediate_size
+
+        self.quant_method.create_weights(layer=self, **moe_quant_params)
+
+        # Chunked all2all staging tensor
+        self.batched_hidden_states: Optional[torch.Tensor] = None
+        self.batched_router_logits: Optional[torch.Tensor] = None
+        if (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels):
+            act_dtype = vllm_config.model_config.dtype
+            self.batched_hidden_states = torch.zeros(
+                (MOE_DP_CHUNK_SIZE, self.hidden_size),
+                dtype=act_dtype,
+                device=torch.cuda.current_device())
+
+            self.batched_router_logits = torch.zeros(
+                (MOE_DP_CHUNK_SIZE, self.global_num_experts),
+                dtype=act_dtype,
+                device=torch.cuda.current_device())
+
+    @property
+    def tp_size(self):
+        return self.moe_parallel_config.tp_size
+
+    @property
+    def dp_size(self):
+        return self.moe_parallel_config.dp_size
+
+    @property
+    def ep_size(self):
+        return self.moe_parallel_config.ep_size
+
+    @property
+    def tp_rank(self):
+        return self.moe_parallel_config.tp_rank
+
+    @property
+    def dp_rank(self):
+        return self.moe_parallel_config.dp_rank
+
+    @property
+    def ep_rank(self):
+        return self.moe_parallel_config.ep_rank
+
+    @property
+    def use_ep(self):
+        return self.moe_parallel_config.use_ep
+
+    @property
+    def use_pplx_kernels(self):
+        return self.moe_parallel_config.use_pplx_kernels
+
+    @property
+    def use_deepep_ht_kernels(self):
+        return self.moe_parallel_config.use_deepep_ht_kernels
+
+    @property
+    def use_deepep_ll_kernels(self):
+        return self.moe_parallel_config.use_deepep_ll_kernels
+
+    def _load_per_tensor_weight_scale(self, shard_id: str,
+                                      param: torch.nn.Parameter,
+                                      loaded_weight: torch.Tensor,
+                                      expert_id: int):
+        param_data = param.data
+        # for per tensor weight quantization
+        if shard_id in ("w1", "w3"):
+            # We have to keep the weight scales of w1 and w3 because
+            # we need to re-quantize w1/w3 weights after weight loading.
+            idx = 0 if shard_id == "w1" else 1
+            param_data[expert_id][idx] = loaded_weight
+        # If we are in the row parallel case (down_proj)
+        elif shard_id == "w2":
+            param_data[expert_id] = loaded_weight
+
+    def _load_model_weight_or_group_weight_scale(self,
+                                                 shard_dim: int,
+                                                 expert_data: torch.Tensor,
+                                                 shard_id: str,
+                                                 loaded_weight: torch.Tensor,
+                                                 tp_rank: int,
+                                                 load_full_w2: bool = False):
+        """
+        Load grouped weight scales for group quantization or model weights.
+        :param shard_dim: dimension to shard
+        :param expert_data: parameter for a particular expert
+        :param shard_id: either w1, w2, or w3
+        :param loaded_weight: checkpoint weight to load into the param
+        :param tp_rank: tensor parallel rank
+        :param load_full_w2: whether to load the full w2 weight without
+            sharding it across tensor parallel ranks.
+        """
+        if shard_id == "w2":
+            # In the case where we have actorder/g_idx, we do not partition
+            # the w2 scales, as indicated by the `load_full` argument, for
+            # all tp cases.
+            self._load_w2(shard_dim=shard_dim,
+                          loaded_weight=loaded_weight,
+                          expert_data=expert_data,
+                          tp_rank=tp_rank,
+                          load_full=load_full_w2)
+        elif shard_id in ("w1", "w3"):
+            self._load_w13(shard_id=shard_id,
+                           shard_dim=shard_dim,
+                           loaded_weight=loaded_weight,
+                           expert_data=expert_data,
+                           tp_rank=tp_rank)
+
+    def _load_per_channel_weight_scale(self, expert_data: torch.Tensor,
+                                       shard_dim: int, shard_id: str,
+                                       loaded_weight: torch.Tensor,
+                                       tp_rank: int):
+        # for per channel weight quantization
+        if shard_id == "w2":
+            expert_data.copy_(loaded_weight)
+        elif shard_id in ("w1", "w3"):
+            self._load_w13(shard_id=shard_id,
+                           shard_dim=shard_dim,
+                           loaded_weight=loaded_weight,
+                           expert_data=expert_data,
+                           tp_rank=tp_rank)
+
+    def _load_w13(self, expert_data: torch.Tensor, shard_dim: int,
+                  shard_id: str, loaded_weight: torch.Tensor, tp_rank: int):
+
+        # Index the loaded weight for tp sharding.
+        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
+        shard_size = expert_data.shape[shard_dim] // 2
+        loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
+                                             shard_size)
+        # Narrow parameter and load.
+        # w1, gate_proj: Load into first logical weight of w13.
+        if shard_id == "w1":
+            expert_data = expert_data.narrow(shard_dim, 0, shard_size)
+        # w3, up_proj: Load into second logical weight of w13.
+        else:
+            assert shard_id == "w3"
+            expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
+        expert_data.copy_(loaded_weight)
+
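+    # Editor's note, a small layout sketch for _load_w13 above (hypothetical
+    # sizes): with intermediate_size_per_partition=1024, the w13 expert_data
+    # has 2 * 1024 rows along shard_dim, so shard_size = 1024. A "w1"
+    # checkpoint shard lands in rows [0, 1024) and the matching "w3" shard in
+    # rows [1024, 2048), reconstructing the fused gate/up layout that the
+    # kernels expect.
+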
+    def _load_w2(self,
+                 expert_data: torch.Tensor,
+                 shard_dim: int,
+                 loaded_weight: torch.Tensor,
+                 tp_rank: int,
+                 load_full: bool = False):
+
+        # Index the loaded weight for tp sharding.
+        # down_proj: "RowParallel" so tp sharding on input_dim
+        # Narrow parameter and load.
+        shard_size = expert_data.shape[shard_dim]
+        if not load_full:
+            loaded_weight = loaded_weight.narrow(shard_dim,
+                                                 shard_size * tp_rank,
+                                                 shard_size)
+        # w2, down_proj: Load into only logical weight of w2.
+        expert_data.copy_(loaded_weight)
+
+    def _load_single_value(self, param: torch.nn.Parameter,
+                           loaded_weight: torch.Tensor, expert_id: int):
+        param_data = param.data
+
+        # Input scales can be loaded directly and should be equal.
+        param_data[expert_id] = loaded_weight
+
+    def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
+                    shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int):
+
+        if shard_id == "w2":
+            self._load_w2(shard_dim=shard_dim,
+                          loaded_weight=loaded_weight,
+                          expert_data=expert_data,
+                          tp_rank=tp_rank)
+        else:
+            assert shard_id in ("w1", "w3")
+            expert_data.copy_(loaded_weight)
+
+    def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
+        if self.expert_map is None:
+            return expert_id
+        return self.expert_map[expert_id].item()
+
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor, weight_name: str,
+                      shard_id: str, expert_id: int) -> None:
+
+        expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
+        if expert_id == -1:
+            return
+        quant_method_name = self.quant_method.__class__.__name__
+        # compressed-tensors checkpoints with packed weights are stored flipped
+        # TODO (mgoin): check self.quant_method.quant_config.quant_format
+        # against known CompressionFormat enum values that have this quality
+        if self.quant_method.__class__.__name__ in (
+                "CompressedTensorsWNA16MarlinMoEMethod",
+                "CompressedTensorsWNA16MoEMethod"):
+            loaded_weight = loaded_weight.t().contiguous()
+
+        if shard_id not in ("w1", "w2", "w3"):
+            raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
+                             f"got {shard_id}.")
+
+        WEIGHT_SCALE_SUPPORTED = [
+            e.value for e in FusedMoeWeightScaleSupported
+        ]
+        # Fetch the dim to shard the parameter/loaded weight
+        # based on the shard id. This will be whichever dimension
+        # intermediate_size_per_partition is on.
+        SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
+
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.weight_type = loaded_weight.item()
+            param.data.copy_(loaded_weight)
+            return
+
+        # is_transposed: whether the dim to shard the weight should be
+        # flipped. Required by GPTQ / compressed-tensors, where the shard
+        # dim should be whichever dimension intermediate_size_per_partition
+        # is on.
+        is_transposed = getattr(param, "is_transposed", False)
+        shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
+        if is_transposed:
+            shard_dim = int(not shard_dim)
+
+        full_load = len(loaded_weight.shape) == 3
+        if full_load:
+            shard_dim += 1
+
+        # Materialize GGUF UninitializedParameter
+        if is_gguf_weight and isinstance(param, UninitializedParameter):
+            final_shape = list(loaded_weight.shape)
+            if shard_id in ["w1", "w3"]:
+                final_shape[1] *= 2
+            final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
+            param.materialize(final_shape, dtype=loaded_weight.dtype)
+
+        expert_data = param.data if full_load else param.data[expert_id]
+        # Case input scale: input_scale loading is only supported for fp8
+        if "input_scale" in weight_name:
+            # this is needed for compressed-tensors only
+            loaded_weight = loaded_weight.to(param.data.device)
+
+            if ("compressed" in quant_method_name.lower()
+                    and param.data[expert_id] != 1
+                    and (param.data[expert_id] - loaded_weight).abs() > 1e-5):
+                raise ValueError(
+                    "input_scales of w1 and w3 of a layer "
+                    f"must be equal. But got {param.data[expert_id]} "
+                    f"vs. {loaded_weight}")
+
+            self._load_single_value(param=param,
+                                    loaded_weight=loaded_weight,
+                                    expert_id=expert_id)
+            return
+
+        # Case g_idx
+        if "g_idx" in weight_name:
+            self._load_g_idx(shard_dim=0,
+                             shard_id=shard_id,
+                             loaded_weight=loaded_weight,
+                             expert_data=expert_data,
+                             tp_rank=self.tp_rank)
+            return
+
+        if "ModelOpt" in quant_method_name:
+            if ('weight_scale_2' in weight_name
+                    or 'input_scale' in weight_name):
+                self._load_per_tensor_weight_scale(shard_id=shard_id,
+                                                   param=param,
+                                                   loaded_weight=loaded_weight,
+                                                   expert_id=expert_id)
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=self.tp_rank)
+            return
+
+        # Case weight scales, zero_points and offset
+        if ("scale" in weight_name or "zero" in weight_name
+                or "offset" in weight_name):
+            # load the weight scales and zp based on the quantization scheme
+            # supported weight scales/zp can be found in
+            # FusedMoeWeightScaleSupported
+            # TODO @dsikka: once hardened, refactor to use vLLM Parameters
+            # specific to each case
+            quant_method = getattr(param, "quant_method", None)
+            if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
+                self._load_per_channel_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=self.tp_rank)
+            elif quant_method in [
+                    FusedMoeWeightScaleSupported.GROUP.value,
+                    FusedMoeWeightScaleSupported.BLOCK.value,
+            ]:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=self.tp_rank,
+                    load_full_w2=getattr(param, "load_full_w2", False))
+            elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
+                self._load_per_tensor_weight_scale(shard_id=shard_id,
+                                                   param=param,
+                                                   loaded_weight=loaded_weight,
+                                                   expert_id=expert_id)
+            else:
+                raise ValueError(
+                    f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
+            return
+
+        # Case weight_shape
+        if "weight_shape" in weight_name:
+            # only required by compressed-tensors
+            self._load_single_value(param=param,
+                                    loaded_weight=loaded_weight,
+                                    expert_id=expert_id)
+            return
+
+        # Case model weights
+        if "weight" in weight_name:
+            self._load_model_weight_or_group_weight_scale(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=self.tp_rank)
+            return
+
+    @staticmethod
+    def select_experts(hidden_states: torch.Tensor,
+                       router_logits: torch.Tensor,
+                       top_k: int,
+                       use_grouped_topk: bool,
+                       renormalize: bool,
+                       topk_group: Optional[int] = None,
+                       num_expert_group: Optional[int] = None,
+                       custom_routing_function: Optional[Callable] = None,
+                       scoring_func: str = "softmax",
+                       e_score_correction_bias: Optional[torch.Tensor] = None,
+                       indices_type: Optional[torch.dtype] = None):
+        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+
+        # DeepSeek-V2 uses grouped_top_k
+        if use_grouped_topk:
+            assert topk_group is not None
+            assert num_expert_group is not None
+            topk_weights, topk_ids = grouped_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                scoring_func=scoring_func,
+                e_score_correction_bias=e_score_correction_bias)
+            if indices_type is not None:
+                topk_ids = topk_ids.to(dtype=indices_type)
+        elif custom_routing_function is None:
+            topk_weights, topk_ids, token_expert_indices = fused_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize,
+                indices_type=indices_type,
+            )
+        else:
+            topk_weights, topk_ids = custom_routing_function(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize)
+            if indices_type is not None:
+                topk_ids = topk_ids.to(dtype=indices_type)
+
+        return topk_weights, topk_ids
+
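+    # Editor's note, a minimal usage sketch for select_experts above (the
+    # tensors are hypothetical, not part of the original source):
+    #   logits = torch.randn(4, 8)            # 4 tokens, 8 experts
+    #   w, ids = FusedMoE.select_experts(
+    #       hidden_states=torch.randn(4, 16), router_logits=logits,
+    #       top_k=2, use_grouped_topk=False, renormalize=True)
+    # Both w and ids then have shape (4, 2); with renormalize=True each row
+    # of the top-k weights sums to 1.
+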
+    def must_reduce_shared_expert_outputs(self) -> bool:
+        """
+        The shared_experts are typically computed using the RowParallelLinear
+        layer. The result of this function is typically used as
+        the reduce_results argument to the module.
+        When just tensor-parallel is used, it is not required to reduce
+        the shared_experts results immediately. Instead we reduce once at
+        the end of the MoE op. (Refer to the DeepSeekV2MoE module.)
+        With EP and all2all kernels this is no longer viable, as all
+        GPU ranks in DP produce the complete set of hidden_states.
+        Therefore it is required that we reduce the shared_experts output
+        early.
+        """
+        return (self.use_pplx_kernels or self.use_deepep_ht_kernels
+                or self.use_deepep_ll_kernels)
+
+    def maybe_all_reduce_tensor_model_parallel(
+            self, final_hidden_states: torch.Tensor):
+        """
+        The pplx combine kernel reduces across GPU ranks by default.
+        """
+        if (self.use_pplx_kernels or self.use_deepep_ht_kernels
+                or self.use_deepep_ll_kernels):
+            return final_hidden_states
+        else:
+            return tensor_model_parallel_all_reduce(final_hidden_states)
+
+    def forward(self, hidden_states: torch.Tensor,
+                router_logits: torch.Tensor):
+        if self.use_direct_call:
+            return self.forward_impl(hidden_states, router_logits)
+        else:
+            return torch.ops.vllm.moe_forward(hidden_states, router_logits,
+                                              self.layer_name)
+
+    def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
+                             full_router_logits: torch.Tensor):
+        assert self.batched_hidden_states is not None
+        assert self.batched_router_logits is not None
+        assert self.batched_hidden_states.dtype == full_hidden_states.dtype
+        assert self.batched_router_logits.dtype == full_router_logits.dtype
+        # Check size compatibility.
+        assert (
+            self.batched_hidden_states.size(-1) == full_hidden_states.size(-1))
+        assert (
+            self.batched_router_logits.size(-1) == full_router_logits.size(-1))
+
+        full_final_hidden_states = torch.empty_like(full_hidden_states)
+
+        def process_chunk(chunk_start, chunk_end, skip_result_store=False):
+            chunk_size = chunk_end - chunk_start
+            hidden_states = full_hidden_states[chunk_start:chunk_end, :]
+            router_logits = full_router_logits[chunk_start:chunk_end, :]
+
+            assert (self.batched_hidden_states.size(0)  # type: ignore
+                    >= chunk_size)
+            assert (self.batched_router_logits.size(0)  # type: ignore
+                    >= chunk_size)
+            staged_hidden_states = self.batched_hidden_states[:
+                                                              chunk_size, :]  # type: ignore
+            staged_router_logits = self.batched_router_logits[:
+                                                              chunk_size, :]  # type: ignore
+            staged_hidden_states.copy_(hidden_states, non_blocking=True)
+            staged_router_logits.copy_(router_logits, non_blocking=True)
+
+            # Matrix multiply.
+            final_hidden_states = self.quant_method.apply(
+                layer=self,
+                x=staged_hidden_states,
+                router_logits=staged_router_logits,
+                top_k=self.top_k,
+                renormalize=self.renormalize,
+                use_grouped_topk=self.use_grouped_topk,
+                global_num_experts=self.global_num_experts,
+                expert_map=self.expert_map,
+                topk_group=self.topk_group,
+                num_expert_group=self.num_expert_group,
+                custom_routing_function=self.custom_routing_function,
+                scoring_func=self.scoring_func,
+                e_score_correction_bias=self.e_score_correction_bias,
+                activation=self.activation,
+            )
+
+            if not skip_result_store:
+                full_final_hidden_states[chunk_start:chunk_end, :].copy_(
+                    final_hidden_states, non_blocking=True)
+
+        ctx = get_forward_context()
+        max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
+        moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens
+
+        num_tokens = full_hidden_states.size(0)
+        for chunk_start_ in range(0, max_tokens_across_dp,
+                                  moe_dp_chunk_size_per_rank):
+            chunk_start = chunk_start_
+            chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
+                            max_tokens_across_dp)
+            # clamp start and end
+            chunk_start = min(chunk_start, num_tokens - 1)
+            chunk_end = min(chunk_end, num_tokens)
+
+            process_chunk(chunk_start,
+                          chunk_end,
+                          skip_result_store=chunk_start_ >= num_tokens)
+
+        return full_final_hidden_states
+
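+    # Editor's note, a worked example of the chunk loop above (the numbers
+    # are hypothetical): with num_tokens=3 on this rank,
+    # max_tokens_across_dp=8 and moe_dp_chunk_size_per_rank=4, the loop runs
+    # for chunk_start_ in (0, 4). The first pass processes tokens [0, 3);
+    # the second clamps to [2, 3) with skip_result_store=True, so the rank
+    # still joins the collective all2all in lockstep with larger DP ranks
+    # but discards the duplicate output.
+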
+    def forward_impl(self, hidden_states: torch.Tensor,
+                     router_logits: torch.Tensor):
+        assert self.quant_method is not None
+        if (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels):
+            return self.forward_impl_chunked(hidden_states, router_logits)
+
+        do_naive_dispatch_combine: bool = (
+            self.dp_size > 1
+            and not self.moe_parallel_config.use_deepep_ht_kernels)
+        if do_naive_dispatch_combine:
+            hidden_states, router_logits = get_ep_group().dispatch(
+                hidden_states, router_logits)
+
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            top_k=self.top_k,
+            renormalize=self.renormalize,
+            use_grouped_topk=self.use_grouped_topk,
+            global_num_experts=self.global_num_experts,
+            expert_map=self.expert_map,
+            topk_group=self.topk_group,
+            num_expert_group=self.num_expert_group,
+            custom_routing_function=self.custom_routing_function,
+            scoring_func=self.scoring_func,
+            e_score_correction_bias=self.e_score_correction_bias,
+            activation=self.activation,
+            apply_router_weight_on_input=self.apply_router_weight_on_input,
+        )
+
+        if do_naive_dispatch_combine:
+            final_hidden_states = get_ep_group().combine(final_hidden_states)
+
+        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+            # reduce_results defaults to False. (May also need to include
+            # shared expert outputs in this reduction.)
+            final_hidden_states = self.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states)
+
+        return final_hidden_states
+
+    @classmethod
+    def make_expert_params_mapping(
+            cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
+            ckpt_up_proj_name: str,
+            num_experts: int) -> list[tuple[str, str, int, str]]:
+
+        return [
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_" if weight_name
+             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
+             f"experts.{expert_id}.{weight_name}.", expert_id, shard_id)
+            for expert_id in range(num_experts) for shard_id, weight_name in [
+                ("w1", ckpt_gate_proj_name),
+                ("w2", ckpt_down_proj_name),
+                ("w3", ckpt_up_proj_name),
+            ]
+        ]
+
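+    # Editor's note, an illustrative expansion of make_expert_params_mapping
+    # above (the checkpoint names are hypothetical): with names "gate_proj",
+    # "down_proj", "up_proj" and num_experts=2, expert 0 contributes
+    #   ("experts.w13_", "experts.0.gate_proj.", 0, "w1"),
+    #   ("experts.w2_",  "experts.0.down_proj.", 0, "w2"),
+    #   ("experts.w13_", "experts.0.up_proj.",   0, "w3"),
+    # and expert 1 contributes the same triple with expert_id 1.
+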
+    def extra_repr(self) -> str:
+
+        s = (
+            f"global_num_experts={self.global_num_experts}, "
+            f"local_num_experts={self.local_num_experts}, "
+            f"top_k={self.top_k}, "
+            f"intermediate_size_per_partition={self.intermediate_size_per_partition}, "  # noqa: E501
+            f"tp_size={self.tp_size},\n"
+            f"ep_size={self.ep_size}, "
+            f"reduce_results={self.reduce_results}, "
+            f"renormalize={self.renormalize}, "
+            f"use_grouped_topk={self.use_grouped_topk}")
+
+        if self.use_grouped_topk:
+            s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501
+
+        s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'"  # noqa: E501
+
+        return s
+
+
+def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
+                layer_name: str) -> torch.Tensor:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    assert self.quant_method is not None
+
+    return self.forward_impl(hidden_states, router_logits)
+
+
+def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
+                     layer_name: str) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+    op_name="moe_forward",
+    op_func=moe_forward,
+    mutates_args=[],
+    fake_impl=moe_forward_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
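+# Editor's note: registering moe_forward as a custom op lets torch.compile
+# treat the MoE layer as an opaque call. moe_forward_fake serves as the
+# fake (meta) implementation, returning an empty tensor of the same shape
+# and dtype so the compiler can trace shapes without running the kernel.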