vllm-cpu 0.9.2.post2 (cp311-cp311-manylinux_2_17_aarch64.whl)

This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions exactly as they appear in their respective public registries.
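Since every entry below is listed as +N -0, this diff is effectively the complete file manifest of the new wheel. A wheel is an ordinary zip archive, so the same listing can be reproduced locally with the Python standard library. The sketch below is a minimal example, assuming the wheel has already been downloaded to the working directory; the exact filename is an assumption inferred from the title of this page.

# Minimal sketch: list the files inside a wheel, which is a plain zip archive.
# Assumes the wheel was fetched beforehand, e.g. with
#   pip download vllm-cpu==0.9.2.post2 --no-deps
# (package availability on your configured index is an assumption).
import zipfile

# Hypothetical local filename, following the standard wheel naming convention.
WHEEL = "vllm_cpu-0.9.2.post2-cp311-cp311-manylinux_2_17_aarch64.whl"

with zipfile.ZipFile(WHEEL) as wheel:
    for info in wheel.infolist():
        # Print each archived path with its uncompressed size in bytes,
        # mirroring the per-file listing shown on this page.
        print(f"{info.filename}  ({info.file_size} bytes)")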
Files changed (1236)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +214 -0
  3. vllm/_custom_ops.py +1915 -0
  4. vllm/_ipex_ops.py +350 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +139 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +325 -0
  20. vllm/attention/backends/blocksparse_attn.py +465 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1506 -0
  23. vllm/attention/backends/flash_attn.py +1008 -0
  24. vllm/attention/backends/flashinfer.py +1107 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +318 -0
  27. vllm/attention/backends/ipex_attn.py +403 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1391 -0
  30. vllm/attention/backends/pallas.py +356 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +1015 -0
  34. vllm/attention/backends/torch_sdpa.py +707 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +807 -0
  38. vllm/attention/layer.py +481 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +903 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/pallas_kv_cache_update.py +120 -0
  52. vllm/attention/ops/prefix_prefill.py +902 -0
  53. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  54. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  55. vllm/attention/ops/triton_decode_attention.py +674 -0
  56. vllm/attention/ops/triton_flash_attention.py +984 -0
  57. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  58. vllm/attention/ops/triton_unified_attention.py +738 -0
  59. vllm/attention/selector.py +214 -0
  60. vllm/attention/utils/fa_utils.py +72 -0
  61. vllm/beam_search.py +87 -0
  62. vllm/benchmarks/__init__.py +0 -0
  63. vllm/benchmarks/datasets.py +1441 -0
  64. vllm/benchmarks/endpoint_request_func.py +393 -0
  65. vllm/benchmarks/latency.py +168 -0
  66. vllm/benchmarks/serve.py +1063 -0
  67. vllm/benchmarks/throughput.py +609 -0
  68. vllm/benchmarks/utils.py +70 -0
  69. vllm/collect_env.py +820 -0
  70. vllm/compilation/__init__.py +0 -0
  71. vllm/compilation/activation_quant_fusion.py +89 -0
  72. vllm/compilation/backends.py +610 -0
  73. vllm/compilation/base_piecewise_backend.py +72 -0
  74. vllm/compilation/collective_fusion.py +127 -0
  75. vllm/compilation/compiler_interface.py +564 -0
  76. vllm/compilation/counter.py +41 -0
  77. vllm/compilation/cuda_piecewise_backend.py +218 -0
  78. vllm/compilation/decorators.py +250 -0
  79. vllm/compilation/fix_functionalization.py +191 -0
  80. vllm/compilation/fusion.py +645 -0
  81. vllm/compilation/fusion_attn.py +166 -0
  82. vllm/compilation/fx_utils.py +84 -0
  83. vllm/compilation/inductor_pass.py +115 -0
  84. vllm/compilation/monitor.py +39 -0
  85. vllm/compilation/multi_output_match.py +109 -0
  86. vllm/compilation/noop_elimination.py +165 -0
  87. vllm/compilation/pass_manager.py +82 -0
  88. vllm/compilation/sequence_parallelism.py +482 -0
  89. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  90. vllm/compilation/vllm_inductor_pass.py +70 -0
  91. vllm/compilation/wrapper.py +135 -0
  92. vllm/config.py +4913 -0
  93. vllm/connections.py +174 -0
  94. vllm/core/__init__.py +0 -0
  95. vllm/core/block/__init__.py +0 -0
  96. vllm/core/block/block_table.py +399 -0
  97. vllm/core/block/common.py +371 -0
  98. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  99. vllm/core/block/interfaces.py +319 -0
  100. vllm/core/block/naive_block.py +466 -0
  101. vllm/core/block/prefix_caching_block.py +1135 -0
  102. vllm/core/block/utils.py +28 -0
  103. vllm/core/block_manager.py +525 -0
  104. vllm/core/evictor.py +157 -0
  105. vllm/core/interfaces.py +139 -0
  106. vllm/core/placeholder_block_space_manager.py +103 -0
  107. vllm/core/scheduler.py +2126 -0
  108. vllm/device_allocator/__init__.py +0 -0
  109. vllm/device_allocator/cumem.py +281 -0
  110. vllm/distributed/__init__.py +6 -0
  111. vllm/distributed/communication_op.py +41 -0
  112. vllm/distributed/device_communicators/__init__.py +0 -0
  113. vllm/distributed/device_communicators/all2all.py +264 -0
  114. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  115. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  116. vllm/distributed/device_communicators/cuda_communicator.py +194 -0
  117. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  118. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  119. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  120. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  121. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  122. vllm/distributed/device_communicators/pynccl.py +218 -0
  123. vllm/distributed/device_communicators/pynccl_wrapper.py +349 -0
  124. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  125. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  126. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  127. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  128. vllm/distributed/eplb/__init__.py +8 -0
  129. vllm/distributed/eplb/eplb_state.py +432 -0
  130. vllm/distributed/eplb/rebalance_algo.py +234 -0
  131. vllm/distributed/eplb/rebalance_execute.py +307 -0
  132. vllm/distributed/kv_events.py +356 -0
  133. vllm/distributed/kv_transfer/README.md +29 -0
  134. vllm/distributed/kv_transfer/__init__.py +12 -0
  135. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  137. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  138. vllm/distributed/kv_transfer/kv_connector/factory.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  140. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  141. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  142. vllm/distributed/kv_transfer/kv_connector/utils.py +109 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1103 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +485 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +533 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +265 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +389 -0
  153. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  154. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  155. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  156. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  158. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  159. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  160. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  161. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  162. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  163. vllm/distributed/parallel_state.py +1385 -0
  164. vllm/distributed/tpu_distributed_utils.py +178 -0
  165. vllm/distributed/utils.py +536 -0
  166. vllm/engine/__init__.py +0 -0
  167. vllm/engine/arg_utils.py +1801 -0
  168. vllm/engine/async_llm_engine.py +1200 -0
  169. vllm/engine/async_timeout.py +173 -0
  170. vllm/engine/llm_engine.py +2101 -0
  171. vllm/engine/metrics.py +629 -0
  172. vllm/engine/metrics_types.py +94 -0
  173. vllm/engine/multiprocessing/__init__.py +148 -0
  174. vllm/engine/multiprocessing/client.py +681 -0
  175. vllm/engine/multiprocessing/engine.py +460 -0
  176. vllm/engine/output_processor/__init__.py +0 -0
  177. vllm/engine/output_processor/interfaces.py +75 -0
  178. vllm/engine/output_processor/multi_step.py +216 -0
  179. vllm/engine/output_processor/single_step.py +145 -0
  180. vllm/engine/output_processor/stop_checker.py +131 -0
  181. vllm/engine/output_processor/util.py +28 -0
  182. vllm/engine/protocol.py +326 -0
  183. vllm/entrypoints/__init__.py +0 -0
  184. vllm/entrypoints/api_server.py +178 -0
  185. vllm/entrypoints/chat_utils.py +1278 -0
  186. vllm/entrypoints/cli/__init__.py +12 -0
  187. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  188. vllm/entrypoints/cli/benchmark/base.py +25 -0
  189. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  190. vllm/entrypoints/cli/benchmark/main.py +58 -0
  191. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  192. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  193. vllm/entrypoints/cli/collect_env.py +36 -0
  194. vllm/entrypoints/cli/main.py +71 -0
  195. vllm/entrypoints/cli/openai.py +201 -0
  196. vllm/entrypoints/cli/run_batch.py +69 -0
  197. vllm/entrypoints/cli/serve.py +265 -0
  198. vllm/entrypoints/cli/types.py +29 -0
  199. vllm/entrypoints/launcher.py +147 -0
  200. vllm/entrypoints/llm.py +1599 -0
  201. vllm/entrypoints/logger.py +50 -0
  202. vllm/entrypoints/openai/__init__.py +0 -0
  203. vllm/entrypoints/openai/api_server.py +1495 -0
  204. vllm/entrypoints/openai/cli_args.py +331 -0
  205. vllm/entrypoints/openai/logits_processors.py +90 -0
  206. vllm/entrypoints/openai/protocol.py +2096 -0
  207. vllm/entrypoints/openai/run_batch.py +473 -0
  208. vllm/entrypoints/openai/serving_chat.py +1258 -0
  209. vllm/entrypoints/openai/serving_classification.py +160 -0
  210. vllm/entrypoints/openai/serving_completion.py +618 -0
  211. vllm/entrypoints/openai/serving_embedding.py +201 -0
  212. vllm/entrypoints/openai/serving_engine.py +988 -0
  213. vllm/entrypoints/openai/serving_models.py +315 -0
  214. vllm/entrypoints/openai/serving_pooling.py +234 -0
  215. vllm/entrypoints/openai/serving_score.py +431 -0
  216. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  217. vllm/entrypoints/openai/serving_transcription.py +132 -0
  218. vllm/entrypoints/openai/speech_to_text.py +395 -0
  219. vllm/entrypoints/openai/tool_parsers/__init__.py +25 -0
  220. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  221. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  222. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  223. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  224. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  225. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  226. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  227. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  228. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  229. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +369 -0
  230. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  231. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  232. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  233. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  234. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +466 -0
  235. vllm/entrypoints/score_utils.py +50 -0
  236. vllm/entrypoints/ssl.py +75 -0
  237. vllm/entrypoints/utils.py +262 -0
  238. vllm/env_override.py +41 -0
  239. vllm/envs.py +1029 -0
  240. vllm/executor/__init__.py +0 -0
  241. vllm/executor/executor_base.py +401 -0
  242. vllm/executor/mp_distributed_executor.py +244 -0
  243. vllm/executor/msgspec_utils.py +30 -0
  244. vllm/executor/multiproc_worker_utils.py +313 -0
  245. vllm/executor/ray_distributed_executor.py +701 -0
  246. vllm/executor/ray_utils.py +399 -0
  247. vllm/executor/uniproc_executor.py +139 -0
  248. vllm/forward_context.py +185 -0
  249. vllm/inputs/__init__.py +41 -0
  250. vllm/inputs/data.py +331 -0
  251. vllm/inputs/parse.py +151 -0
  252. vllm/inputs/preprocess.py +924 -0
  253. vllm/inputs/registry.py +245 -0
  254. vllm/jsontree.py +80 -0
  255. vllm/logger.py +212 -0
  256. vllm/logging_utils/__init__.py +8 -0
  257. vllm/logging_utils/dump_input.py +81 -0
  258. vllm/logging_utils/formatter.py +18 -0
  259. vllm/logits_process.py +119 -0
  260. vllm/lora/__init__.py +0 -0
  261. vllm/lora/fully_sharded_layers.py +355 -0
  262. vllm/lora/layers.py +1285 -0
  263. vllm/lora/lora.py +199 -0
  264. vllm/lora/models.py +818 -0
  265. vllm/lora/ops/__init__.py +0 -0
  266. vllm/lora/ops/torch_ops/__init__.py +16 -0
  267. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  268. vllm/lora/ops/triton_ops/__init__.py +12 -0
  269. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  270. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  271. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  272. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  273. vllm/lora/ops/triton_ops/utils.py +120 -0
  274. vllm/lora/ops/xla_ops/__init__.py +7 -0
  275. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  276. vllm/lora/peft_helper.py +136 -0
  277. vllm/lora/punica_wrapper/__init__.py +10 -0
  278. vllm/lora/punica_wrapper/punica_base.py +485 -0
  279. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  280. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  281. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  284. vllm/lora/punica_wrapper/utils.py +164 -0
  285. vllm/lora/request.py +99 -0
  286. vllm/lora/resolver.py +85 -0
  287. vllm/lora/utils.py +240 -0
  288. vllm/lora/worker_manager.py +256 -0
  289. vllm/model_executor/__init__.py +16 -0
  290. vllm/model_executor/custom_op.py +208 -0
  291. vllm/model_executor/guided_decoding/__init__.py +181 -0
  292. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  293. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  294. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  295. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  296. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  297. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  298. vllm/model_executor/guided_decoding/utils.py +242 -0
  299. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  300. vllm/model_executor/layers/__init__.py +0 -0
  301. vllm/model_executor/layers/activation.py +420 -0
  302. vllm/model_executor/layers/fused_moe/__init__.py +78 -0
  303. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +298 -0
  304. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +140 -0
  305. vllm/model_executor/layers/fused_moe/config.py +456 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  475. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +215 -0
  476. vllm/model_executor/layers/fused_moe/cutlass_moe.py +645 -0
  477. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +250 -0
  478. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +231 -0
  479. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +183 -0
  480. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1021 -0
  481. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +234 -0
  482. vllm/model_executor/layers/fused_moe/fused_moe.py +1734 -0
  483. vllm/model_executor/layers/fused_moe/layer.py +1528 -0
  484. vllm/model_executor/layers/fused_moe/modular_kernel.py +598 -0
  485. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +224 -0
  486. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  487. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  488. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  489. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +233 -0
  490. vllm/model_executor/layers/fused_moe/prepare_finalize.py +66 -0
  491. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +429 -0
  492. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +136 -0
  493. vllm/model_executor/layers/fused_moe/utils.py +144 -0
  494. vllm/model_executor/layers/layernorm.py +287 -0
  495. vllm/model_executor/layers/lightning_attn.py +652 -0
  496. vllm/model_executor/layers/linear.py +1547 -0
  497. vllm/model_executor/layers/logits_processor.py +197 -0
  498. vllm/model_executor/layers/mamba/__init__.py +0 -0
  499. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  500. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  501. vllm/model_executor/layers/mamba/mamba_mixer2.py +731 -0
  502. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  503. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  504. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  505. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  506. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  507. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  508. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  509. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  510. vllm/model_executor/layers/pooler.py +473 -0
  511. vllm/model_executor/layers/quantization/__init__.py +160 -0
  512. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  513. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  514. vllm/model_executor/layers/quantization/awq.py +228 -0
  515. vllm/model_executor/layers/quantization/awq_marlin.py +523 -0
  516. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  517. vllm/model_executor/layers/quantization/base_config.py +164 -0
  518. vllm/model_executor/layers/quantization/bitblas.py +462 -0
  519. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  520. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  521. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +694 -0
  522. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1613 -0
  523. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  524. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  525. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  526. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  527. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  528. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +149 -0
  529. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  530. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  531. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  532. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  533. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  534. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  535. vllm/model_executor/layers/quantization/deepgemm.py +83 -0
  536. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  537. vllm/model_executor/layers/quantization/experts_int8.py +204 -0
  538. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  539. vllm/model_executor/layers/quantization/fp8.py +950 -0
  540. vllm/model_executor/layers/quantization/gguf.py +577 -0
  541. vllm/model_executor/layers/quantization/gptq.py +278 -0
  542. vllm/model_executor/layers/quantization/gptq_bitblas.py +446 -0
  543. vllm/model_executor/layers/quantization/gptq_marlin.py +679 -0
  544. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  545. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  546. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  547. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  548. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  549. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  550. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  551. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  552. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  553. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +132 -0
  554. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  555. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  556. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  557. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  558. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  559. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  560. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  561. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  562. vllm/model_executor/layers/quantization/marlin.py +263 -0
  563. vllm/model_executor/layers/quantization/modelopt.py +747 -0
  564. vllm/model_executor/layers/quantization/moe_wna16.py +457 -0
  565. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  566. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  567. vllm/model_executor/layers/quantization/qqq.py +275 -0
  568. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  569. vllm/model_executor/layers/quantization/quark/quark.py +437 -0
  570. vllm/model_executor/layers/quantization/quark/quark_moe.py +245 -0
  571. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  572. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  573. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  574. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +157 -0
  575. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  576. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  577. vllm/model_executor/layers/quantization/rtn.py +289 -0
  578. vllm/model_executor/layers/quantization/schema.py +86 -0
  579. vllm/model_executor/layers/quantization/torchao.py +212 -0
  580. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  581. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  582. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  583. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/fp8_utils.py +653 -0
  787. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  788. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  789. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  790. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  791. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  792. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  793. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  794. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  795. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  796. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  797. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  798. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +146 -0
  799. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  800. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  801. vllm/model_executor/layers/rejection_sampler.py +406 -0
  802. vllm/model_executor/layers/resampler.py +270 -0
  803. vllm/model_executor/layers/rotary_embedding.py +2025 -0
  804. vllm/model_executor/layers/sampler.py +1204 -0
  805. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  806. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  807. vllm/model_executor/layers/utils.py +116 -0
  808. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  809. vllm/model_executor/model_loader/__init__.py +77 -0
  810. vllm/model_executor/model_loader/base_loader.py +43 -0
  811. vllm/model_executor/model_loader/bitsandbytes_loader.py +613 -0
  812. vllm/model_executor/model_loader/default_loader.py +282 -0
  813. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  814. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  815. vllm/model_executor/model_loader/neuron.py +476 -0
  816. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  817. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  818. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  819. vllm/model_executor/model_loader/tensorizer.py +602 -0
  820. vllm/model_executor/model_loader/tensorizer_loader.py +127 -0
  821. vllm/model_executor/model_loader/tpu.py +113 -0
  822. vllm/model_executor/model_loader/utils.py +315 -0
  823. vllm/model_executor/model_loader/weight_utils.py +782 -0
  824. vllm/model_executor/models/__init__.py +30 -0
  825. vllm/model_executor/models/adapters.py +375 -0
  826. vllm/model_executor/models/aimv2.py +246 -0
  827. vllm/model_executor/models/arctic.py +559 -0
  828. vllm/model_executor/models/aria.py +670 -0
  829. vllm/model_executor/models/aya_vision.py +486 -0
  830. vllm/model_executor/models/baichuan.py +474 -0
  831. vllm/model_executor/models/bamba.py +558 -0
  832. vllm/model_executor/models/bart.py +938 -0
  833. vllm/model_executor/models/bert.py +513 -0
  834. vllm/model_executor/models/bert_with_rope.py +617 -0
  835. vllm/model_executor/models/blip.py +339 -0
  836. vllm/model_executor/models/blip2.py +728 -0
  837. vllm/model_executor/models/bloom.py +373 -0
  838. vllm/model_executor/models/chameleon.py +1146 -0
  839. vllm/model_executor/models/chatglm.py +478 -0
  840. vllm/model_executor/models/clip.py +407 -0
  841. vllm/model_executor/models/commandr.py +471 -0
  842. vllm/model_executor/models/config.py +200 -0
  843. vllm/model_executor/models/constant_size_cache.py +137 -0
  844. vllm/model_executor/models/dbrx.py +472 -0
  845. vllm/model_executor/models/deepseek.py +486 -0
  846. vllm/model_executor/models/deepseek_mtp.py +281 -0
  847. vllm/model_executor/models/deepseek_v2.py +935 -0
  848. vllm/model_executor/models/deepseek_vl2.py +660 -0
  849. vllm/model_executor/models/dots1.py +536 -0
  850. vllm/model_executor/models/eagle.py +261 -0
  851. vllm/model_executor/models/ernie45.py +43 -0
  852. vllm/model_executor/models/ernie45_moe.py +583 -0
  853. vllm/model_executor/models/exaone.py +551 -0
  854. vllm/model_executor/models/fairseq2_llama.py +154 -0
  855. vllm/model_executor/models/falcon.py +510 -0
  856. vllm/model_executor/models/falcon_h1.py +708 -0
  857. vllm/model_executor/models/florence2.py +1113 -0
  858. vllm/model_executor/models/fuyu.py +406 -0
  859. vllm/model_executor/models/gemma.py +427 -0
  860. vllm/model_executor/models/gemma2.py +427 -0
  861. vllm/model_executor/models/gemma3.py +535 -0
  862. vllm/model_executor/models/gemma3_mm.py +729 -0
  863. vllm/model_executor/models/gemma3n.py +811 -0
  864. vllm/model_executor/models/glm.py +23 -0
  865. vllm/model_executor/models/glm4.py +305 -0
  866. vllm/model_executor/models/glm4_1v.py +1590 -0
  867. vllm/model_executor/models/glm4v.py +657 -0
  868. vllm/model_executor/models/gpt2.py +382 -0
  869. vllm/model_executor/models/gpt_bigcode.py +335 -0
  870. vllm/model_executor/models/gpt_j.py +339 -0
  871. vllm/model_executor/models/gpt_neox.py +332 -0
  872. vllm/model_executor/models/granite.py +493 -0
  873. vllm/model_executor/models/granite_speech.py +790 -0
  874. vllm/model_executor/models/granitemoe.py +437 -0
  875. vllm/model_executor/models/granitemoehybrid.py +653 -0
  876. vllm/model_executor/models/granitemoeshared.py +341 -0
  877. vllm/model_executor/models/gritlm.py +224 -0
  878. vllm/model_executor/models/grok1.py +546 -0
  879. vllm/model_executor/models/h2ovl.py +549 -0
  880. vllm/model_executor/models/hunyuan_v1_moe.py +897 -0
  881. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  882. vllm/model_executor/models/idefics3.py +786 -0
  883. vllm/model_executor/models/interfaces.py +681 -0
  884. vllm/model_executor/models/interfaces_base.py +164 -0
  885. vllm/model_executor/models/intern_vit.py +480 -0
  886. vllm/model_executor/models/internlm2.py +455 -0
  887. vllm/model_executor/models/internlm2_ve.py +147 -0
  888. vllm/model_executor/models/internvl.py +1432 -0
  889. vllm/model_executor/models/jais.py +373 -0
  890. vllm/model_executor/models/jamba.py +592 -0
  891. vllm/model_executor/models/keye.py +1736 -0
  892. vllm/model_executor/models/kimi_vl.py +585 -0
  893. vllm/model_executor/models/llama.py +644 -0
  894. vllm/model_executor/models/llama4.py +531 -0
  895. vllm/model_executor/models/llama_eagle.py +165 -0
  896. vllm/model_executor/models/llama_eagle3.py +263 -0
  897. vllm/model_executor/models/llava.py +887 -0
  898. vllm/model_executor/models/llava_next.py +604 -0
  899. vllm/model_executor/models/llava_next_video.py +492 -0
  900. vllm/model_executor/models/llava_onevision.py +985 -0
  901. vllm/model_executor/models/mamba.py +273 -0
  902. vllm/model_executor/models/mamba2.py +320 -0
  903. vllm/model_executor/models/mamba_cache.py +76 -0
  904. vllm/model_executor/models/medusa.py +219 -0
  905. vllm/model_executor/models/mimo.py +192 -0
  906. vllm/model_executor/models/mimo_mtp.py +285 -0
  907. vllm/model_executor/models/minicpm.py +592 -0
  908. vllm/model_executor/models/minicpm3.py +230 -0
  909. vllm/model_executor/models/minicpm_eagle.py +391 -0
  910. vllm/model_executor/models/minicpmo.py +772 -0
  911. vllm/model_executor/models/minicpmv.py +1307 -0
  912. vllm/model_executor/models/minimax_cache.py +36 -0
  913. vllm/model_executor/models/minimax_text_01.py +1301 -0
  914. vllm/model_executor/models/minimax_vl_01.py +374 -0
  915. vllm/model_executor/models/mistral3.py +624 -0
  916. vllm/model_executor/models/mixtral.py +488 -0
  917. vllm/model_executor/models/mixtral_quant.py +453 -0
  918. vllm/model_executor/models/mllama.py +1682 -0
  919. vllm/model_executor/models/mllama4.py +947 -0
  920. vllm/model_executor/models/mlp_speculator.py +206 -0
  921. vllm/model_executor/models/modernbert.py +339 -0
  922. vllm/model_executor/models/module_mapping.py +72 -0
  923. vllm/model_executor/models/molmo.py +1576 -0
  924. vllm/model_executor/models/moonvit.py +630 -0
  925. vllm/model_executor/models/mpt.py +331 -0
  926. vllm/model_executor/models/nemotron.py +508 -0
  927. vllm/model_executor/models/nemotron_h.py +588 -0
  928. vllm/model_executor/models/nemotron_nas.py +484 -0
  929. vllm/model_executor/models/nvlm_d.py +216 -0
  930. vllm/model_executor/models/olmo.py +389 -0
  931. vllm/model_executor/models/olmo2.py +414 -0
  932. vllm/model_executor/models/olmoe.py +468 -0
  933. vllm/model_executor/models/opt.py +412 -0
  934. vllm/model_executor/models/orion.py +349 -0
  935. vllm/model_executor/models/ovis.py +577 -0
  936. vllm/model_executor/models/paligemma.py +419 -0
  937. vllm/model_executor/models/persimmon.py +344 -0
  938. vllm/model_executor/models/phi.py +356 -0
  939. vllm/model_executor/models/phi3.py +19 -0
  940. vllm/model_executor/models/phi3_small.py +465 -0
  941. vllm/model_executor/models/phi3v.py +733 -0
  942. vllm/model_executor/models/phi4mm.py +1258 -0
  943. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  944. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  945. vllm/model_executor/models/phimoe.py +674 -0
  946. vllm/model_executor/models/pixtral.py +1329 -0
  947. vllm/model_executor/models/plamo2.py +738 -0
  948. vllm/model_executor/models/prithvi_geospatial_mae.py +240 -0
  949. vllm/model_executor/models/qwen.py +362 -0
  950. vllm/model_executor/models/qwen2.py +501 -0
  951. vllm/model_executor/models/qwen2_5_omni_thinker.py +923 -0
  952. vllm/model_executor/models/qwen2_5_vl.py +1175 -0
  953. vllm/model_executor/models/qwen2_audio.py +420 -0
  954. vllm/model_executor/models/qwen2_moe.py +540 -0
  955. vllm/model_executor/models/qwen2_rm.py +122 -0
  956. vllm/model_executor/models/qwen2_vl.py +1513 -0
  957. vllm/model_executor/models/qwen3.py +325 -0
  958. vllm/model_executor/models/qwen3_moe.py +541 -0
  959. vllm/model_executor/models/qwen_vl.py +796 -0
  960. vllm/model_executor/models/registry.py +634 -0
  961. vllm/model_executor/models/roberta.py +271 -0
  962. vllm/model_executor/models/siglip.py +524 -0
  963. vllm/model_executor/models/skyworkr1v.py +961 -0
  964. vllm/model_executor/models/smolvlm.py +52 -0
  965. vllm/model_executor/models/solar.py +506 -0
  966. vllm/model_executor/models/stablelm.py +343 -0
  967. vllm/model_executor/models/starcoder2.py +356 -0
  968. vllm/model_executor/models/tarsier.py +652 -0
  969. vllm/model_executor/models/telechat2.py +140 -0
  970. vllm/model_executor/models/teleflm.py +79 -0
  971. vllm/model_executor/models/transformers.py +509 -0
  972. vllm/model_executor/models/ultravox.py +670 -0
  973. vllm/model_executor/models/utils.py +744 -0
  974. vllm/model_executor/models/vision.py +147 -0
  975. vllm/model_executor/models/whisper.py +886 -0
  976. vllm/model_executor/models/zamba2.py +1036 -0
  977. vllm/model_executor/parameter.py +459 -0
  978. vllm/model_executor/pooling_metadata.py +72 -0
  979. vllm/model_executor/sampling_metadata.py +597 -0
  980. vllm/model_executor/utils.py +80 -0
  981. vllm/multimodal/__init__.py +33 -0
  982. vllm/multimodal/audio.py +116 -0
  983. vllm/multimodal/base.py +219 -0
  984. vllm/multimodal/hasher.py +91 -0
  985. vllm/multimodal/image.py +103 -0
  986. vllm/multimodal/inputs.py +878 -0
  987. vllm/multimodal/parse.py +499 -0
  988. vllm/multimodal/processing.py +1948 -0
  989. vllm/multimodal/profiling.py +283 -0
  990. vllm/multimodal/registry.py +331 -0
  991. vllm/multimodal/utils.py +492 -0
  992. vllm/multimodal/video.py +227 -0
  993. vllm/outputs.py +516 -0
  994. vllm/platforms/__init__.py +291 -0
  995. vllm/platforms/cpu.py +281 -0
  996. vllm/platforms/cuda.py +568 -0
  997. vllm/platforms/hpu.py +106 -0
  998. vllm/platforms/interface.py +551 -0
  999. vllm/platforms/neuron.py +150 -0
  1000. vllm/platforms/rocm.py +453 -0
  1001. vllm/platforms/tpu.py +206 -0
  1002. vllm/platforms/xpu.py +192 -0
  1003. vllm/plugins/__init__.py +94 -0
  1004. vllm/plugins/lora_resolvers/README.md +15 -0
  1005. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1006. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1007. vllm/pooling_params.py +64 -0
  1008. vllm/profiler/__init__.py +0 -0
  1009. vllm/profiler/layerwise_profile.py +375 -0
  1010. vllm/profiler/utils.py +148 -0
  1011. vllm/prompt_adapter/__init__.py +0 -0
  1012. vllm/prompt_adapter/layers.py +83 -0
  1013. vllm/prompt_adapter/models.py +358 -0
  1014. vllm/prompt_adapter/request.py +37 -0
  1015. vllm/prompt_adapter/utils.py +98 -0
  1016. vllm/prompt_adapter/worker_manager.py +179 -0
  1017. vllm/py.typed +2 -0
  1018. vllm/reasoning/__init__.py +15 -0
  1019. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  1020. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1021. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1022. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1023. vllm/sampling_params.py +602 -0
  1024. vllm/scalar_type.py +347 -0
  1025. vllm/scripts.py +15 -0
  1026. vllm/sequence.py +1568 -0
  1027. vllm/spec_decode/__init__.py +0 -0
  1028. vllm/spec_decode/batch_expansion.py +506 -0
  1029. vllm/spec_decode/draft_model_runner.py +349 -0
  1030. vllm/spec_decode/interfaces.py +99 -0
  1031. vllm/spec_decode/medusa_worker.py +138 -0
  1032. vllm/spec_decode/metrics.py +213 -0
  1033. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1034. vllm/spec_decode/mqa_scorer.py +160 -0
  1035. vllm/spec_decode/multi_step_worker.py +423 -0
  1036. vllm/spec_decode/ngram_worker.py +196 -0
  1037. vllm/spec_decode/proposer_worker_base.py +59 -0
  1038. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1039. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1040. vllm/spec_decode/target_model_runner.py +45 -0
  1041. vllm/spec_decode/top1_proposer.py +275 -0
  1042. vllm/spec_decode/util.py +277 -0
  1043. vllm/test_utils.py +130 -0
  1044. vllm/third_party/__init__.py +0 -0
  1045. vllm/third_party/pynvml.py +6140 -0
  1046. vllm/tracing.py +131 -0
  1047. vllm/transformers_utils/__init__.py +24 -0
  1048. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1049. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1050. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1051. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1052. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1053. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1054. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1055. vllm/transformers_utils/config.py +922 -0
  1056. vllm/transformers_utils/configs/__init__.py +57 -0
  1057. vllm/transformers_utils/configs/arctic.py +207 -0
  1058. vllm/transformers_utils/configs/chatglm.py +72 -0
  1059. vllm/transformers_utils/configs/cohere2.py +195 -0
  1060. vllm/transformers_utils/configs/dbrx.py +280 -0
  1061. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1062. vllm/transformers_utils/configs/eagle.py +85 -0
  1063. vllm/transformers_utils/configs/exaone.py +190 -0
  1064. vllm/transformers_utils/configs/falcon.py +90 -0
  1065. vllm/transformers_utils/configs/jais.py +238 -0
  1066. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1067. vllm/transformers_utils/configs/medusa.py +63 -0
  1068. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1069. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1070. vllm/transformers_utils/configs/mllama.py +31 -0
  1071. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1072. vllm/transformers_utils/configs/moonvit.py +33 -0
  1073. vllm/transformers_utils/configs/mpt.py +180 -0
  1074. vllm/transformers_utils/configs/nemotron.py +205 -0
  1075. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1076. vllm/transformers_utils/configs/nvlm_d.py +31 -0
  1077. vllm/transformers_utils/configs/ovis.py +184 -0
  1078. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1079. vllm/transformers_utils/configs/solar.py +247 -0
  1080. vllm/transformers_utils/configs/telechat2.py +64 -0
  1081. vllm/transformers_utils/configs/ultravox.py +108 -0
  1082. vllm/transformers_utils/detokenizer.py +168 -0
  1083. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1084. vllm/transformers_utils/processor.py +221 -0
  1085. vllm/transformers_utils/processors/__init__.py +8 -0
  1086. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1087. vllm/transformers_utils/processors/ovis.py +420 -0
  1088. vllm/transformers_utils/s3_utils.py +162 -0
  1089. vllm/transformers_utils/tokenizer.py +302 -0
  1090. vllm/transformers_utils/tokenizer_base.py +149 -0
  1091. vllm/transformers_utils/tokenizer_group.py +120 -0
  1092. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1093. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1094. vllm/transformers_utils/utils.py +99 -0
  1095. vllm/triton_utils/__init__.py +14 -0
  1096. vllm/triton_utils/importing.py +94 -0
  1097. vllm/usage/__init__.py +0 -0
  1098. vllm/usage/usage_lib.py +259 -0
  1099. vllm/utils/__init__.py +3008 -0
  1100. vllm/v1/__init__.py +0 -0
  1101. vllm/v1/attention/__init__.py +0 -0
  1102. vllm/v1/attention/backends/__init__.py +0 -0
  1103. vllm/v1/attention/backends/cpu_attn.py +184 -0
  1104. vllm/v1/attention/backends/flash_attn.py +757 -0
  1105. vllm/v1/attention/backends/flashinfer.py +680 -0
  1106. vllm/v1/attention/backends/flex_attention.py +491 -0
  1107. vllm/v1/attention/backends/mamba_attn.py +192 -0
  1108. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1109. vllm/v1/attention/backends/mla/common.py +978 -0
  1110. vllm/v1/attention/backends/mla/cutlass_mla.py +98 -0
  1111. vllm/v1/attention/backends/mla/flashmla.py +180 -0
  1112. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +241 -0
  1113. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1114. vllm/v1/attention/backends/pallas.py +320 -0
  1115. vllm/v1/attention/backends/rocm_aiter_fa.py +609 -0
  1116. vllm/v1/attention/backends/triton_attn.py +449 -0
  1117. vllm/v1/attention/backends/utils.py +310 -0
  1118. vllm/v1/core/__init__.py +0 -0
  1119. vllm/v1/core/block_pool.py +349 -0
  1120. vllm/v1/core/encoder_cache_manager.py +254 -0
  1121. vllm/v1/core/kv_cache_coordinator.py +369 -0
  1122. vllm/v1/core/kv_cache_manager.py +398 -0
  1123. vllm/v1/core/kv_cache_utils.py +999 -0
  1124. vllm/v1/core/sched/__init__.py +0 -0
  1125. vllm/v1/core/sched/interface.py +150 -0
  1126. vllm/v1/core/sched/output.py +157 -0
  1127. vllm/v1/core/sched/request_queue.py +224 -0
  1128. vllm/v1/core/sched/scheduler.py +1115 -0
  1129. vllm/v1/core/sched/utils.py +36 -0
  1130. vllm/v1/core/single_type_kv_cache_manager.py +444 -0
  1131. vllm/v1/engine/__init__.py +179 -0
  1132. vllm/v1/engine/async_llm.py +626 -0
  1133. vllm/v1/engine/coordinator.py +278 -0
  1134. vllm/v1/engine/core.py +1046 -0
  1135. vllm/v1/engine/core_client.py +1049 -0
  1136. vllm/v1/engine/detokenizer.py +292 -0
  1137. vllm/v1/engine/exceptions.py +17 -0
  1138. vllm/v1/engine/llm_engine.py +322 -0
  1139. vllm/v1/engine/logprobs.py +200 -0
  1140. vllm/v1/engine/mm_input_cache.py +91 -0
  1141. vllm/v1/engine/output_processor.py +477 -0
  1142. vllm/v1/engine/parallel_sampling.py +133 -0
  1143. vllm/v1/engine/processor.py +422 -0
  1144. vllm/v1/engine/utils.py +546 -0
  1145. vllm/v1/executor/__init__.py +0 -0
  1146. vllm/v1/executor/abstract.py +113 -0
  1147. vllm/v1/executor/multiproc_executor.py +532 -0
  1148. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1149. vllm/v1/kv_cache_interface.py +223 -0
  1150. vllm/v1/metrics/__init__.py +0 -0
  1151. vllm/v1/metrics/loggers.py +557 -0
  1152. vllm/v1/metrics/prometheus.py +82 -0
  1153. vllm/v1/metrics/ray_wrappers.py +131 -0
  1154. vllm/v1/metrics/reader.py +246 -0
  1155. vllm/v1/metrics/stats.py +240 -0
  1156. vllm/v1/outputs.py +124 -0
  1157. vllm/v1/pool/__init__.py +0 -0
  1158. vllm/v1/pool/metadata.py +17 -0
  1159. vllm/v1/request.py +229 -0
  1160. vllm/v1/sample/__init__.py +0 -0
  1161. vllm/v1/sample/logits_processor.py +517 -0
  1162. vllm/v1/sample/metadata.py +43 -0
  1163. vllm/v1/sample/ops/__init__.py +0 -0
  1164. vllm/v1/sample/ops/bad_words.py +39 -0
  1165. vllm/v1/sample/ops/penalties.py +43 -0
  1166. vllm/v1/sample/ops/topk_topp_sampler.py +296 -0
  1167. vllm/v1/sample/rejection_sampler.py +631 -0
  1168. vllm/v1/sample/sampler.py +226 -0
  1169. vllm/v1/sample/tpu/__init__.py +0 -0
  1170. vllm/v1/sample/tpu/metadata.py +124 -0
  1171. vllm/v1/sample/tpu/sampler.py +145 -0
  1172. vllm/v1/serial_utils.py +315 -0
  1173. vllm/v1/spec_decode/__init__.py +0 -0
  1174. vllm/v1/spec_decode/eagle.py +441 -0
  1175. vllm/v1/spec_decode/medusa.py +64 -0
  1176. vllm/v1/spec_decode/metadata.py +62 -0
  1177. vllm/v1/spec_decode/metrics.py +178 -0
  1178. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1179. vllm/v1/spec_decode/utils.py +41 -0
  1180. vllm/v1/structured_output/__init__.py +227 -0
  1181. vllm/v1/structured_output/backend_guidance.py +245 -0
  1182. vllm/v1/structured_output/backend_types.py +134 -0
  1183. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1184. vllm/v1/structured_output/request.py +86 -0
  1185. vllm/v1/structured_output/utils.py +175 -0
  1186. vllm/v1/utils.py +377 -0
  1187. vllm/v1/worker/__init__.py +0 -0
  1188. vllm/v1/worker/block_table.py +142 -0
  1189. vllm/v1/worker/cpu_model_runner.py +91 -0
  1190. vllm/v1/worker/cpu_worker.py +153 -0
  1191. vllm/v1/worker/gpu_input_batch.py +757 -0
  1192. vllm/v1/worker/gpu_model_runner.py +2739 -0
  1193. vllm/v1/worker/gpu_worker.py +408 -0
  1194. vllm/v1/worker/lora_model_runner_mixin.py +177 -0
  1195. vllm/v1/worker/tpu_input_batch.py +585 -0
  1196. vllm/v1/worker/tpu_model_runner.py +1849 -0
  1197. vllm/v1/worker/tpu_worker.py +315 -0
  1198. vllm/v1/worker/utils.py +112 -0
  1199. vllm/v1/worker/worker_base.py +65 -0
  1200. vllm/v1/worker/xpu_model_runner.py +33 -0
  1201. vllm/v1/worker/xpu_worker.py +165 -0
  1202. vllm/version.py +41 -0
  1203. vllm/vllm_flash_attn/.gitkeep +0 -0
  1204. vllm/worker/__init__.py +0 -0
  1205. vllm/worker/cache_engine.py +145 -0
  1206. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1207. vllm/worker/cpu_model_runner.py +671 -0
  1208. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1209. vllm/worker/cpu_worker.py +452 -0
  1210. vllm/worker/enc_dec_model_runner.py +555 -0
  1211. vllm/worker/hpu_model_runner.py +2320 -0
  1212. vllm/worker/hpu_worker.py +484 -0
  1213. vllm/worker/model_runner.py +2178 -0
  1214. vllm/worker/model_runner_base.py +282 -0
  1215. vllm/worker/multi_step_hpu_worker.py +123 -0
  1216. vllm/worker/multi_step_model_runner.py +911 -0
  1217. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1218. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1219. vllm/worker/multi_step_tpu_worker.py +108 -0
  1220. vllm/worker/multi_step_worker.py +197 -0
  1221. vllm/worker/neuron_model_runner.py +460 -0
  1222. vllm/worker/neuron_worker.py +193 -0
  1223. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1224. vllm/worker/pooling_model_runner.py +211 -0
  1225. vllm/worker/tpu_model_runner.py +909 -0
  1226. vllm/worker/tpu_worker.py +337 -0
  1227. vllm/worker/utils.py +53 -0
  1228. vllm/worker/worker.py +577 -0
  1229. vllm/worker/worker_base.py +646 -0
  1230. vllm/worker/xpu_model_runner.py +606 -0
  1231. vllm/worker/xpu_worker.py +186 -0
  1232. vllm_cpu-0.9.2.post2.dist-info/METADATA +339 -0
  1233. vllm_cpu-0.9.2.post2.dist-info/RECORD +1236 -0
  1234. vllm_cpu-0.9.2.post2.dist-info/WHEEL +5 -0
  1235. vllm_cpu-0.9.2.post2.dist-info/entry_points.txt +5 -0
  1236. vllm_cpu-0.9.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1528 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ from abc import abstractmethod
5
+ from collections.abc import Iterable
6
+ from enum import Enum
7
+ from typing import Callable, Literal, Optional, overload
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch.nn.parameter import UninitializedParameter
12
+
13
+ import vllm.envs as envs
14
+ from vllm.config import get_current_vllm_config
15
+ from vllm.distributed import (get_dp_group, get_ep_group,
16
+ get_tensor_model_parallel_world_size,
17
+ tensor_model_parallel_all_reduce)
18
+ from vllm.distributed.eplb.eplb_state import EplbState
19
+ from vllm.forward_context import ForwardContext, get_forward_context
20
+ from vllm.logger import init_logger
21
+ from vllm.model_executor.custom_op import CustomOp
22
+ # yapf: disable
23
+ from vllm.model_executor.layers.fused_moe.config import (
24
+ FusedMoEConfig, FusedMoEParallelConfig)
25
+ # yapf: enable
26
+ from vllm.model_executor.layers.fused_moe.modular_kernel import (
27
+ FusedMoEActivationFormat, FusedMoEModularKernel,
28
+ FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize)
29
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
30
+ is_rocm_aiter_moe_enabled)
31
+ from vllm.model_executor.layers.quantization.base_config import (
32
+ QuantizationConfig, QuantizeMethodBase)
33
+ from vllm.model_executor.utils import set_weight_attrs
34
+ from vllm.platforms import current_platform
35
+ from vllm.platforms.interface import CpuArchEnum
36
+ from vllm.utils import direct_register_custom_op, has_deep_ep, has_pplx
37
+
38
+ if current_platform.is_cuda_alike():
39
+ from .fused_batched_moe import BatchedTritonExperts
40
+ from .fused_moe import TritonExperts, fused_experts
41
+ if has_pplx():
42
+ from .pplx_prepare_finalize import (PplxPrepareAndFinalize,
43
+ pplx_hidden_dim_scale_bytes)
44
+ if has_deep_ep():
45
+ from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
46
+ from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SHAPE,
47
+ DeepEPLLPrepareAndFinalize)
48
+ else:
49
+ fused_experts = None # type: ignore
50
+ FusedMoEPermuteExpertsUnpermute = None # type: ignore
51
+ FusedMoEPrepareAndFinalize = None # type: ignore
52
+ if is_rocm_aiter_moe_enabled():
53
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501
54
+ rocm_aiter_grouped_topk as grouped_topk)
55
+ elif current_platform.is_cpu():
56
+ pass
57
+ else:
58
+ from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
59
+ if current_platform.is_tpu():
60
+ from .moe_pallas import fused_moe as fused_moe_pallas
61
+ else:
62
+ fused_moe_pallas = None # type: ignore
63
+
64
+ logger = init_logger(__name__)
65
+
66
+
67
+ class FusedMoeWeightScaleSupported(Enum):
68
+ TENSOR = "tensor"
69
+ CHANNEL = "channel"
70
+ GROUP = "group"
71
+ BLOCK = "block"
72
+
73
+
74
+ class FusedMoEMethodBase(QuantizeMethodBase):
75
+
76
+ moe: FusedMoEConfig
77
+
78
+ @abstractmethod
79
+ def create_weights(self, layer: torch.nn.Module, num_experts: int,
80
+ hidden_size: int, intermediate_size_per_partition: int,
81
+ params_dtype: torch.dtype, **extra_weight_attrs):
82
+ raise NotImplementedError
83
+
84
+ def init_prepare_finalize(self, moe: FusedMoEConfig,
85
+ quant_config: Optional[QuantizationConfig]):
86
+ all2all_manager = get_ep_group().device_communicator.all2all_manager
87
+ assert all2all_manager is not None
88
+
89
+ self.moe = moe
90
+
91
+ prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None
92
+
93
+ if moe.use_pplx_kernels:
94
+ hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes(
95
+ moe.max_num_tokens,
96
+ moe.hidden_dim,
97
+ moe.in_dtype,
98
+ moe.quant_dtype,
99
+ per_act_token_quant=moe.per_act_token_quant,
100
+ block_shape=moe.block_shape,
101
+ )
102
+
103
+ all_to_all_args = dict(
104
+ max_num_tokens=moe.max_num_tokens,
105
+ num_experts=moe.num_experts,
106
+ experts_per_token=moe.experts_per_token, # topk
107
+ rank=all2all_manager.rank,
108
+ world_size=all2all_manager.world_size,
109
+ # dp_size actually means tp_size, bug in pplx kernels
110
+ dp_size=all2all_manager.tp_group.world_size,
111
+ hidden_dim=moe.hidden_dim,
112
+ hidden_dim_bytes=hidden_dim_bytes,
113
+ hidden_dim_scale_bytes=hidden_scale_bytes,
114
+ )
115
+
116
+ num_dispatchers = (all2all_manager.world_size //
117
+ all2all_manager.tp_group.world_size)
118
+
119
+ # Intranode pplx a2a takes a group name while internode does not.
120
+ if not all2all_manager.internode:
121
+ all_to_all_args[
122
+ "group_name"] = all2all_manager.cpu_group.group_name
123
+
124
+ handle = all2all_manager.get_handle(all_to_all_args)
125
+
126
+ prepare_finalize = PplxPrepareAndFinalize(
127
+ handle,
128
+ max_num_tokens=moe.max_num_tokens,
129
+ num_local_experts=moe.num_local_experts,
130
+ num_dispatchers=num_dispatchers,
131
+ )
132
+ elif moe.use_deepep_ht_kernels:
133
+ assert moe.dp_size == all2all_manager.dp_world_size
134
+
135
+ all_to_all_args = dict()
136
+ handle = all2all_manager.get_handle(all_to_all_args)
137
+ prepare_finalize = DeepEPHTPrepareAndFinalize(
138
+ handle,
139
+ num_dispatchers=all2all_manager.world_size,
140
+ dp_size=all2all_manager.dp_world_size,
141
+ rank_expert_offset=all2all_manager.rank *
142
+ moe.num_local_experts,
143
+ )
144
+
145
+ elif moe.use_deepep_ll_kernels:
146
+ all_to_all_args = dict(
147
+ max_num_tokens_per_dp_rank=moe.max_num_tokens,
148
+ token_hidden_size=moe.hidden_dim,
149
+ num_ep_ranks=all2all_manager.world_size,
150
+ num_global_experts=moe.num_experts,
151
+ num_local_experts=moe.num_experts //
152
+ all2all_manager.world_size)
153
+ handle = all2all_manager.get_handle(all_to_all_args)
154
+
155
+ # Note : We may want to use FP8 dispatch even otherwise just to
156
+ # reduce datamovement
157
+ use_fp8_dispatch = (moe.quant_config is not None
158
+ and moe.quant_config.quant_dtype
159
+ == current_platform.fp8_dtype()
160
+ and moe.quant_config.block_shape
161
+ == DEEPEP_QUANT_BLOCK_SHAPE)
162
+
163
+ # Note (varun): Whether to use FP8 dispatch or not needs some
164
+ # profiling. Turning it off for now.
165
+ prepare_finalize = DeepEPLLPrepareAndFinalize(
166
+ handle,
167
+ max_tokens_per_rank=moe.max_num_tokens,
168
+ num_dispatchers=all2all_manager.world_size,
169
+ use_fp8_dispatch=use_fp8_dispatch,
170
+ )
171
+
172
+ self.topk_indices_dtype = None
173
+ if prepare_finalize is not None:
174
+ logger.debug("%s", prepare_finalize.__class__.__name__)
175
+ self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
176
+ experts = self.select_gemm_impl(prepare_finalize, moe)
177
+ self.fused_experts = FusedMoEModularKernel(
178
+ prepare_finalize,
179
+ experts,
180
+ )
181
+
182
+ def select_gemm_impl(
183
+ self,
184
+ prepare_finalize: FusedMoEPrepareAndFinalize,
185
+ moe: FusedMoEConfig,
186
+ ) -> FusedMoEPermuteExpertsUnpermute:
187
+ # based on the all2all implementation, select the appropriate
188
+ # gemm implementation
189
+ raise NotImplementedError(
190
+ f"{self.__class__.__name__} must select appropriate gemm "
191
+ "implementation based on the prepare_finalize")
192
+
193
+ @abstractmethod
194
+ def apply(
195
+ self,
196
+ layer: torch.nn.Module,
197
+ x: torch.Tensor,
198
+ router_logits: torch.Tensor,
199
+ top_k: int,
200
+ renormalize: bool,
201
+ use_grouped_topk: bool = False,
202
+ topk_group: Optional[int] = None,
203
+ num_expert_group: Optional[int] = None,
204
+ global_num_experts: int = -1,
205
+ expert_map: Optional[torch.Tensor] = None,
206
+ custom_routing_function: Optional[Callable] = None,
207
+ scoring_func: str = "softmax",
208
+ e_score_correction_bias: Optional[torch.Tensor] = None,
209
+ apply_router_weight_on_input: bool = False,
210
+ activation: str = "silu",
211
+ enable_eplb: bool = False,
212
+ expert_load_view: Optional[torch.Tensor] = None,
213
+ logical_to_physical_map: Optional[torch.Tensor] = None,
214
+ logical_replica_count: Optional[torch.Tensor] = None,
215
+ ) -> torch.Tensor:
216
+ raise NotImplementedError
217
+
218
+
219
+ @CustomOp.register("unquantized_fused_moe")
220
+ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
221
+ """MoE method without quantization."""
222
+
223
+ def __init__(self, moe: FusedMoEConfig):
224
+ super().__init__()
225
+ self.fused_experts = fused_experts # type: ignore
226
+ self.topk_indices_dtype = None
227
+ self.moe = moe
228
+
229
+ self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
230
+ if self.rocm_aiter_moe_enabled:
231
+ from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
232
+ self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
233
+ else:
234
+ self.rocm_aiter_fused_experts = None # type: ignore
235
+
236
+ def select_gemm_impl(
237
+ self,
238
+ prepare_finalize: FusedMoEPrepareAndFinalize,
239
+ moe: FusedMoEConfig,
240
+ ) -> FusedMoEPermuteExpertsUnpermute:
241
+
242
+ assert self.fused_experts == fused_experts
243
+
244
+ if (prepare_finalize.activation_format ==
245
+ FusedMoEActivationFormat.BatchedExperts):
246
+ logger.debug("BatchedTritonExperts %s", self.moe)
247
+ return BatchedTritonExperts(
248
+ max_num_tokens=self.moe.max_num_tokens,
249
+ num_dispatchers=prepare_finalize.num_dispatchers(),
250
+ )
251
+ else:
252
+ logger.debug("TritonExperts %s", self.moe)
253
+ return TritonExperts()
254
+
255
+ def create_weights(self, layer: torch.nn.Module, num_experts: int,
256
+ hidden_size: int, intermediate_size_per_partition: int,
257
+ params_dtype: torch.dtype, **extra_weight_attrs):
258
+ # Fused gate_up_proj (column parallel)
259
+ w13_weight = torch.nn.Parameter(torch.empty(
260
+ num_experts,
261
+ 2 * intermediate_size_per_partition,
262
+ hidden_size,
263
+ dtype=params_dtype),
264
+ requires_grad=False)
265
+ layer.register_parameter("w13_weight", w13_weight)
266
+ set_weight_attrs(w13_weight, extra_weight_attrs)
267
+
268
+ # down_proj (row parallel)
269
+ w2_weight = torch.nn.Parameter(torch.empty(
270
+ num_experts,
271
+ hidden_size,
272
+ intermediate_size_per_partition,
273
+ dtype=params_dtype),
274
+ requires_grad=False)
275
+ layer.register_parameter("w2_weight", w2_weight)
276
+ set_weight_attrs(w2_weight, extra_weight_attrs)
277
+
278
+ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
279
+ # Pad the weight tensor. This is an optimization on ROCm platform, which
280
+ # can benefit from tensors located far enough from one another in memory
281
+ if (envs.VLLM_ROCM_MOE_PADDING and current_platform.is_rocm()
282
+ and weight.stride(-1) == 1
283
+ and (weight.stride(-2) * weight.element_size()) % 512 == 0):
284
+ num_pad = 256 // weight.element_size()
285
+ weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
286
+ torch.cuda.empty_cache()
287
+ return weight
288
+
289
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
290
+ super().process_weights_after_loading(layer)
291
+
292
+ # Padding the weight for better performance on ROCm
293
+ layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
294
+ layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
295
+ # Lazy import to avoid importing triton.
296
+ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
297
+ shuffle_weights)
298
+
299
+ if self.rocm_aiter_moe_enabled:
300
+ shuffled_w13, shuffled_w2 = shuffle_weights(
301
+ layer.w13_weight.data, layer.w2_weight.data)
302
+
303
+ layer.w13_weight.data = shuffled_w13
304
+ layer.w2_weight.data = shuffled_w2
305
+
306
+ if current_platform.is_cpu():
307
+ if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
308
+ from vllm.model_executor.layers.fused_moe import cpu_fused_moe
309
+ dtype = layer.w13_weight.dtype
310
+ if (envs.VLLM_CPU_SGL_KERNEL
311
+ and torch._C._cpu._is_amx_tile_supported()
312
+ and dtype == torch.bfloat16):
313
+ packed_w13_weight = torch.ops._C.convert_weight_packed(
314
+ layer.w13_weight)
315
+ assert packed_w13_weight.size() == layer.w13_weight.size()
316
+ layer.w13_weight.copy_(packed_w13_weight)
317
+ del packed_w13_weight
318
+ packed_w2_weight = torch.ops._C.convert_weight_packed(
319
+ layer.w2_weight)
320
+ assert packed_w2_weight.size() == layer.w2_weight.size()
321
+ layer.w2_weight.copy_(packed_w2_weight)
322
+ layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer)
323
+ else:
324
+ layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer)
325
+ else:
326
+ raise NotImplementedError("CPU MOE only supports x86 arch.")
327
+
328
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if enable_eplb:
+            raise NotImplementedError(
+                "EPLB not supported for `UnquantizedFusedMoEMethod` yet.")
+
+        return self.forward(
+            x=x,
+            layer=layer,
+            router_logits=router_logits,
+            top_k=top_k,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias,
+            activation=activation,
+            apply_router_weight_on_input=apply_router_weight_on_input)
+
+    def forward_cuda(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias,
+            indices_type=self.topk_indices_dtype)
+
+        if self.rocm_aiter_moe_enabled:
+            return self.rocm_aiter_fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                expert_map=expert_map,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input)
+        else:
+            return self.fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+            )
+
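+    # Annotation (not part of the original file): a minimal, kernel-free
+    # sketch of the routing step used above (the softmax scoring path of
+    # `FusedMoE.select_experts`, without grouping or a custom router):
+    #
+    #   >>> logits = torch.tensor([[0.1, 2.0, 0.3, 1.5]])  # one token
+    #   >>> scores = logits.softmax(dim=-1)
+    #   >>> topk_weights, topk_ids = scores.topk(2, dim=-1)
+    #   >>> topk_weights /= topk_weights.sum(-1, keepdim=True)  # renormalize
+    #
+    # `fused_topk` computes the same quantities with a fused kernel; the
+    # resulting (topk_weights, topk_ids) feed the expert GEMMs.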
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        **kwargs,
+    ):
+        return layer.cpu_fused_moe(
+            layer,
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            global_num_experts,
+            expert_map,
+            custom_routing_function,
+            scoring_func,
+            e_score_correction_bias,
+            apply_router_weight_on_input,
+            activation,
+        )
+
+    def forward_hpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+        assert not use_grouped_topk
+        assert num_expert_group is None
+        assert topk_group is None
+        assert custom_routing_function is None
+        assert layer is not None
+        assert apply_router_weight_on_input is False
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax scoring function is supported for HPU.")
+        if e_score_correction_bias is not None:
+            raise NotImplementedError(
+                "Expert score correction bias is not supported for HPU.")
+        return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight,
+                                   router_logits, top_k)
+
+    def forward_tpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+        assert not use_grouped_topk
+        assert num_expert_group is None
+        assert topk_group is None
+        assert custom_routing_function is None
+        assert apply_router_weight_on_input is False
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax scoring function is supported for TPU.")
+        if e_score_correction_bias is not None:
+            raise NotImplementedError(
+                "Expert score correction bias is not supported for TPU.")
+        assert activation == "silu", f"{activation} is not supported for TPU."
+        return fused_moe_pallas(hidden_states=x,
+                                w1=layer.w13_weight,
+                                w2=layer.w2_weight,
+                                topk=top_k,
+                                gating_output=router_logits,
+                                global_num_experts=global_num_experts,
+                                expert_map=expert_map,
+                                renormalize=renormalize)
+
+    if current_platform.is_tpu():
+        forward_native = forward_tpu
+    elif current_platform.is_cpu():
+        forward_native = forward_cpu
+    else:
+        forward_native = forward_cuda
+
+
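+# Annotation (not part of the original file): the class-body aliasing above
+# picks the backend implementation once, at import time, instead of
+# branching per call. The pattern in isolation (`is_cpu` is a stand-in
+# flag):
+#
+#   >>> class Impl:
+#   ...     def _cpu(self): return "cpu path"
+#   ...     def _gpu(self): return "gpu path"
+#   ...     run = _cpu if is_cpu else _gpu
+#
+# The surrounding CustomOp machinery then dispatches `forward()` to
+# `forward_native` (or a platform-specific variant) with no per-call checks.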
+def determine_expert_map(
+        ep_size: int, ep_rank: int,
+        global_num_experts: int) -> tuple[int, Optional[torch.Tensor]]:
+    """
+    Calculates how many experts should be assigned to each rank for EP and
+    creates a mapping from global to local expert index. Experts are
+    distributed evenly across ranks. Any remaining are assigned to the
+    last rank.
+
+    Args:
+        ep_size (int): The size of the expert parallel group.
+        ep_rank (int): The rank of the current process in the expert
+            parallel group.
+        global_num_experts (int): The total number of experts in the model.
+
+    Returns:
+        tuple[int, Optional[torch.Tensor]]: A tuple containing:
+            - local_num_experts (int): The number of experts assigned
+                to the current rank.
+            - expert_map (Optional[torch.Tensor]): A tensor of shape
+                (global_num_experts,) mapping from global to local index.
+                Contains -1 for experts not assigned to the current rank.
+                Returns None if ep_size is 1.
+    """
+    assert ep_size > 0
+    if ep_size == 1:
+        return (global_num_experts, None)
+
+    local_num_experts = global_num_experts // ep_size
+
+    # Create a tensor of size num_experts filled with -1
+    expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
+    # Create an expert map for the local experts
+    if ep_rank < (ep_size - 1):
+        # Each non-last rank gets local_num_experts experts.
+        expert_map[ep_rank * local_num_experts:
+                   (ep_rank + 1) * local_num_experts] = \
+            torch.arange(0, local_num_experts, dtype=torch.int32)
+    else:
+        # All remaining experts are assigned to the last rank.
+        local_num_experts = (global_num_experts - ep_rank * local_num_experts)
+
+        expert_map[-local_num_experts:] = \
+            torch.arange(0, local_num_experts, dtype=torch.int32)
+    return (local_num_experts, expert_map)
+
+
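+# Annotation (not part of the original file): a worked example of
+# determine_expert_map above. With 10 experts over ep_size=4, ranks 0-2 get
+# 2 experts each and the last rank absorbs the remaining 4:
+#
+#   >>> determine_expert_map(ep_size=4, ep_rank=1, global_num_experts=10)
+#   (2, tensor([-1, -1,  0,  1, -1, -1, -1, -1, -1, -1], dtype=torch.int32))
+#   >>> determine_expert_map(ep_size=4, ep_rank=3, global_num_experts=10)
+#   (4, tensor([-1, -1, -1, -1, -1, -1,  0,  1,  2,  3], dtype=torch.int32))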
+class FusedMoE(torch.nn.Module):
+    """FusedMoE layer for MoE models.
+
+    This layer contains both MergedColumnParallel weights (gate_up_proj /
+    w13) and RowParallelLinear weights (down_proj / w2).
+
+    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
+    copy that naming convention here and handle any remapping in the
+    load_weights function in each model implementation.
+
+    Args:
+        num_experts: Number of experts in the model
+        top_k: Number of experts selected for each token
+        hidden_size: Input hidden state size of the transformer
+        intermediate_size: Intermediate size of the experts
+        params_dtype: Data type for the parameters.
+        reduce_results: Whether to all_reduce the output of the layer
+        renormalize: Whether to renormalize the logits in the fused_moe
+            kernel
+        quant_config: Quantization configuration.
+        enable_eplb: Whether to enable the expert parallelism load balancer.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,  # Global number of experts
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = False,
+        renormalize: bool = True,
+        use_grouped_topk: bool = False,
+        num_expert_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        ep_size: Optional[int] = None,
+        dp_size: Optional[int] = None,
+        prefix: str = "",
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        num_redundant_experts: int = 0,
+    ):
+        super().__init__()
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+
+        tp_size_ = (tp_size if tp_size is not None else
+                    get_tensor_model_parallel_world_size())
+        dp_size_ = (dp_size
+                    if dp_size is not None else get_dp_group().world_size)
+
+        vllm_config = get_current_vllm_config()
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=tp_size_,
+                dp_size_=dp_size_,
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.global_num_experts = num_experts + num_redundant_experts
+
+        # For smuggling this layer into the fused moe custom op
+        compilation_config = vllm_config.compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError("Duplicate layer name: {}".format(prefix))
+        compilation_config.static_forward_context[prefix] = self
+        self.layer_name = prefix
+
+        self.enable_eplb = enable_eplb
+        self.expert_load_view: Optional[torch.Tensor] = None
+        self.logical_to_physical_map: Optional[torch.Tensor] = None
+        self.logical_replica_count: Optional[torch.Tensor] = None
+
+        # Determine expert maps
+        if self.use_ep:
+            if self.enable_eplb:
+                assert self.global_num_experts % self.ep_size == 0, \
+                    "EPLB currently only supports even distribution of " \
+                    "experts across ranks."
+            else:
+                assert num_redundant_experts == 0, \
+                    "Redundant experts are only supported with EPLB."
+            self.local_num_experts, self.expert_map = determine_expert_map(
+                ep_size=self.ep_size,
+                ep_rank=self.ep_rank,
+                global_num_experts=self.global_num_experts)
+        else:
+            self.local_num_experts, self.expert_map = (
+                self.global_num_experts, None)
+
+        self.top_k = top_k
+
+        assert intermediate_size % self.tp_size == 0
+        self.hidden_size = hidden_size
+        self.intermediate_size_per_partition = (intermediate_size //
+                                                self.tp_size)
+        self.reduce_results = reduce_results
+        self.renormalize = renormalize
+        self.use_grouped_topk = use_grouped_topk
+        if self.use_grouped_topk:
+            assert num_expert_group is not None and topk_group is not None
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.custom_routing_function = custom_routing_function
+        self.scoring_func = scoring_func
+        self.e_score_correction_bias = e_score_correction_bias
+        self.apply_router_weight_on_input = apply_router_weight_on_input
+        self.activation = activation
+
+        if self.scoring_func != "softmax" and not self.use_grouped_topk:
+            raise ValueError("Only softmax scoring function is supported for "
+                             "non-grouped topk.")
+        if current_platform.is_hpu():
+            from vllm_hpu_extension.ops import DynamicFusedMOE
+            self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)
+
+        if vllm_config.model_config is not None:
+            model_dtype = vllm_config.model_config.dtype
+        else:
+            # TODO (bnell): This is a hack to get test_mixtral_moe to work
+            # since model_config is not set in the pytest test.
+            model_dtype = params_dtype
+
+        moe = FusedMoEConfig.make(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            in_dtype=model_dtype,
+            max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE,
+            quant_config=quant_config,
+        )
+        self.moe_config = moe
+        self.quant_config = quant_config
+
+        # Note: get_quant_method will look at the layer's local_num_experts
+        # for heuristic purposes, so it must be initialized first.
+        quant_method: Optional[QuantizeMethodBase] = None
+        quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None
+                        else quant_config.get_quant_method(self, prefix))
+
+        assert quant_method is not None
+        assert isinstance(quant_method, FusedMoEMethodBase)
+        self.quant_method = quant_method
+
+        if self.enable_eplb:
+            from vllm.model_executor.layers.quantization.fp8 import (
+                Fp8MoEMethod)
+            if not isinstance(quant_method, Fp8MoEMethod):
+                # TODO: Add support for additional quantization methods.
+                # The implementation for other quantization methods does not
+                # contain essential differences, but the current quant API
+                # design causes duplicated work when extending to new
+                # quantization methods, so I'm leaving it for now.
+                # If you plan to add support for more quantization methods,
+                # please refer to the implementation in `Fp8MoEMethod`.
+                raise NotImplementedError("EPLB is only supported for FP8 "
+                                          "quantization for now.")
+
+        moe_quant_params = {
+            "num_experts": self.local_num_experts,
+            "hidden_size": hidden_size,
+            "intermediate_size_per_partition":
+            self.intermediate_size_per_partition,
+            "params_dtype": params_dtype,
+            "weight_loader": self.weight_loader,
+        }
+        # need full intermediate size pre-sharding for WNA16 act order
+        if (self.quant_method.__class__.__name__
+                in ("GPTQMarlinMoEMethod",
+                    "CompressedTensorsWNA16MarlinMoEMethod",
+                    "CompressedTensorsWNA16MoEMethod")):
+            moe_quant_params["intermediate_size_full"] = intermediate_size
+
+        self.quant_method.create_weights(layer=self, **moe_quant_params)
+
+        # Chunked all2all staging tensors
+        self.batched_hidden_states: Optional[torch.Tensor] = None
+        self.batched_router_logits: Optional[torch.Tensor] = None
+        if (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels):
+            self.batched_hidden_states = torch.zeros(
+                (moe.max_num_tokens, self.hidden_size),
+                dtype=moe.in_dtype,
+                device=torch.cuda.current_device())
+
+            # Note: here we use `num_experts`, which is the logical expert
+            # count, not the physical count.
+            self.batched_router_logits = torch.zeros(
+                (moe.max_num_tokens, num_experts),
+                dtype=moe.in_dtype,
+                device=torch.cuda.current_device())
+
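+    # Annotation (not part of the original file): a minimal construction
+    # sketch, assuming a Mixtral-style layer (8 experts, top-2) and that a
+    # vLLM config context is active so get_current_vllm_config() resolves:
+    #
+    #   >>> moe = FusedMoE(num_experts=8, top_k=2, hidden_size=4096,
+    #   ...                intermediate_size=14336, reduce_results=True,
+    #   ...                prefix="model.layers.0.mlp.experts")
+    #   >>> out = moe(hidden_states, router_logits)
+    #
+    # where hidden_states is (num_tokens, 4096) and router_logits is
+    # (num_tokens, 8).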
+    @property
+    def tp_size(self):
+        return self.moe_parallel_config.tp_size
+
+    @property
+    def dp_size(self):
+        return self.moe_parallel_config.dp_size
+
+    @property
+    def ep_size(self):
+        return self.moe_parallel_config.ep_size
+
+    @property
+    def tp_rank(self):
+        return self.moe_parallel_config.tp_rank
+
+    @property
+    def dp_rank(self):
+        return self.moe_parallel_config.dp_rank
+
+    @property
+    def ep_rank(self):
+        return self.moe_parallel_config.ep_rank
+
+    @property
+    def use_ep(self):
+        return self.moe_parallel_config.use_ep
+
+    @property
+    def use_pplx_kernels(self):
+        return self.moe_parallel_config.use_pplx_kernels
+
+    @property
+    def use_deepep_ht_kernels(self):
+        return self.moe_parallel_config.use_deepep_ht_kernels
+
+    @property
+    def use_deepep_ll_kernels(self):
+        return self.moe_parallel_config.use_deepep_ll_kernels
+
+    def _load_per_tensor_weight_scale(self, shard_id: str,
+                                      param: torch.nn.Parameter,
+                                      loaded_weight: torch.Tensor,
+                                      expert_id: int):
+        param_data = param.data
+        # for per tensor weight quantization
+        if shard_id in ("w1", "w3"):
+            # We have to keep the weight scales of w1 and w3 because
+            # we need to re-quantize w1/w3 weights after weight loading.
+            idx = 0 if shard_id == "w1" else 1
+            param_data[expert_id][idx] = loaded_weight
+        # If we are in the row parallel case (down_proj)
+        elif shard_id == "w2":
+            param_data[expert_id] = loaded_weight
+
+    def _load_model_weight_or_group_weight_scale(self,
+                                                 shard_dim: int,
+                                                 expert_data: torch.Tensor,
+                                                 shard_id: str,
+                                                 loaded_weight: torch.Tensor,
+                                                 tp_rank: int,
+                                                 load_full_w2: bool = False):
+        """
+        Load grouped weight scales for group quantization or model weights.
+
+        :param shard_dim: dimension to shard
+        :param expert_data: parameter for a particular expert
+        :param shard_id: either w1, w2, or w3
+        :param loaded_weight: checkpoint weight to load into the param
+        :param tp_rank: tensor parallel rank
+        :param load_full_w2: whether to load w2 in full, without tp sharding
+        """
+        if shard_id == "w2":
+            # In the case where we have actorder/g_idx, we do not partition
+            # the w2 scales, as indicated by the `load_full` argument, for
+            # all tp cases
+            self._load_w2(shard_dim=shard_dim,
+                          loaded_weight=loaded_weight,
+                          expert_data=expert_data,
+                          tp_rank=tp_rank,
+                          load_full=load_full_w2)
+        elif shard_id in ("w1", "w3"):
+            self._load_w13(shard_id=shard_id,
+                           shard_dim=shard_dim,
+                           loaded_weight=loaded_weight,
+                           expert_data=expert_data,
+                           tp_rank=tp_rank)
+
+    def _load_per_channel_weight_scale(self, expert_data: torch.Tensor,
+                                       shard_dim: int, shard_id: str,
+                                       loaded_weight: torch.Tensor,
+                                       tp_rank: int):
+        # for per channel weight quantization
+        if shard_id == "w2":
+            expert_data.copy_(loaded_weight)
+        elif shard_id in ("w1", "w3"):
+            self._load_w13(shard_id=shard_id,
+                           shard_dim=shard_dim,
+                           loaded_weight=loaded_weight,
+                           expert_data=expert_data,
+                           tp_rank=tp_rank)
+
+    def _load_w13(self, expert_data: torch.Tensor, shard_dim: int,
+                  shard_id: str, loaded_weight: torch.Tensor, tp_rank: int):
+
+        # Index the loaded weight for tp sharding.
+        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
+        shard_size = expert_data.shape[shard_dim] // 2
+        loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
+                                             shard_size)
+        # Narrow parameter and load.
+        # w1, gate_proj: Load into first logical weight of w13.
+        if shard_id == "w1":
+            expert_data = expert_data.narrow(shard_dim, 0, shard_size)
+        # w3, up_proj: Load into second logical weight of w13.
+        else:
+            assert shard_id == "w3"
+            expert_data = expert_data.narrow(shard_dim, shard_size,
+                                             shard_size)
+        expert_data.copy_(loaded_weight)
+
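+    # Annotation (not part of the original file): a worked example of the
+    # indexing in _load_w13, assuming intermediate_size=16384 and tp_size=4,
+    # so each rank's fused w13 output dim is 2 * 4096 = 8192. For tp_rank=1:
+    #
+    #   shard_size = 8192 // 2 = 4096
+    #   loaded_weight.narrow(0, 4096 * 1, 4096)  # rows 4096:8192 of the ckpt
+    #   w1 fills expert_data[0:4096], w3 fills expert_data[4096:8192]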
+    def _load_w2(self,
+                 expert_data: torch.Tensor,
+                 shard_dim: int,
+                 loaded_weight: torch.Tensor,
+                 tp_rank: int,
+                 load_full: bool = False):
+
+        # Index the loaded weight for tp sharding.
+        # down_proj: "RowParallel", so tp sharding on input_dim
+        # Narrow parameter and load.
+        shard_size = expert_data.shape[shard_dim]
+        if not load_full:
+            loaded_weight = loaded_weight.narrow(shard_dim,
+                                                 shard_size * tp_rank,
+                                                 shard_size)
+        # w2, down_proj: Load into only logical weight of w2.
+        expert_data.copy_(loaded_weight)
+
+    def _load_single_value(self, param: torch.nn.Parameter,
+                           loaded_weight: torch.Tensor, expert_id: int):
+        param_data = param.data
+
+        # Input scales can be loaded directly and should be equal.
+        param_data[expert_id] = loaded_weight
+
+    def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
+                    shard_dim: int, loaded_weight: torch.Tensor,
+                    tp_rank: int):
+
+        if shard_id == "w2":
+            self._load_w2(shard_dim=shard_dim,
+                          loaded_weight=loaded_weight,
+                          expert_data=expert_data,
+                          tp_rank=tp_rank)
+        else:
+            assert shard_id in ("w1", "w3")
+            expert_data.copy_(loaded_weight)
+
+    def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
+        if self.expert_map is None:
+            return expert_id
+        return self.expert_map[expert_id].item()
+
+    @overload
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor, weight_name: str,
+                      shard_id: str, expert_id: int,
+                      return_success: Literal[False]) -> None:
+        ...
+
+    @overload
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor, weight_name: str,
+                      shard_id: str, expert_id: int,
+                      return_success: Literal[True]) -> bool:
+        ...
+
+    def weight_loader(self,
+                      param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor,
+                      weight_name: str,
+                      shard_id: str,
+                      expert_id: int,
+                      return_success: bool = False) -> Optional[bool]:
+        expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
+        if expert_id == -1:
+            # Failed to load this param since it's not local to this rank
+            return False if return_success else None
+        # Hereafter, `expert_id` is the local physical id
+
+        quant_method_name = self.quant_method.__class__.__name__
+        # compressed-tensors checkpoints with packed weights are stored
+        # flipped
+        # TODO (mgoin): check self.quant_method.quant_config.quant_format
+        # against known CompressionFormat enum values that have this quality
+        if quant_method_name in ("CompressedTensorsWNA16MarlinMoEMethod",
+                                 "CompressedTensorsWNA16MoEMethod"):
+            loaded_weight = loaded_weight.t().contiguous()
+
+        if shard_id not in ("w1", "w2", "w3"):
+            raise ValueError(f"shard_id must be one of ('w1', 'w2', 'w3') "
+                             f"but got {shard_id}.")
+
+        WEIGHT_SCALE_SUPPORTED = [
+            e.value for e in FusedMoeWeightScaleSupported
+        ]
+        # Fetch the dim to shard the parameter/loaded weight based on the
+        # shard id. This will be whatever dimension
+        # intermediate_size_per_partition is used for.
+        SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
+
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.weight_type = loaded_weight.item()
+            param.data.copy_(loaded_weight)
+            return True if return_success else None
+
+        # is_transposed: whether the dim to shard the weight should be
+        # flipped. Required by GPTQ and compressed-tensors; should be
+        # whatever dimension intermediate_size_per_partition is used for.
+        is_transposed = getattr(param, "is_transposed", False)
+        shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
+        if is_transposed:
+            shard_dim = int(not shard_dim)
+
+        full_load = len(loaded_weight.shape) == 3
+        if full_load:
+            shard_dim += 1
+
+        # Materialize GGUF UninitializedParameter
+        if is_gguf_weight and isinstance(param, UninitializedParameter):
+            final_shape = list(loaded_weight.shape)
+            if shard_id in ["w1", "w3"]:
+                final_shape[1] *= 2
+            final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
+            param.materialize(final_shape, dtype=loaded_weight.dtype)
+
+        expert_data = param.data if full_load else param.data[expert_id]
+
+        # Case input scale: input_scale loading is only supported for fp8
+        if "input_scale" in weight_name:
+            # this is needed for compressed-tensors only
+            loaded_weight = loaded_weight.to(param.data.device)
+
+            if ("compressed" in quant_method_name.lower()
+                    and param.data[expert_id] != 1
+                    and (param.data[expert_id] - loaded_weight).abs() > 1e-5):
+                raise ValueError(
+                    "input_scales of w1 and w3 of a layer "
+                    f"must be equal. But got {param.data[expert_id]} "
+                    f"vs. {loaded_weight}")
+
+            self._load_single_value(param=param,
+                                    loaded_weight=loaded_weight,
+                                    expert_id=expert_id)
+            return True if return_success else None
+
+        # Case g_idx
+        if "g_idx" in weight_name:
+            self._load_g_idx(shard_dim=0,
+                             shard_id=shard_id,
+                             loaded_weight=loaded_weight,
+                             expert_data=expert_data,
+                             tp_rank=self.tp_rank)
+            return True if return_success else None
+
+        # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern
+        if "ModelOpt" in quant_method_name:
+            if ('weight_scale_2' in weight_name
+                    or 'input_scale' in weight_name):
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id)
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=self.tp_rank)
+            return True if return_success else None
+
+        # Case weight scales, zero_points and offset, weight/input global
+        # scales
+        if ("scale" in weight_name or "zero" in weight_name
+                or "offset" in weight_name):
+            # load the weight scales and zp based on the quantization scheme
+            # supported weight scales/zp can be found in
+            # FusedMoeWeightScaleSupported
+            # TODO @dsikka: once hardened, refactor to use vLLM Parameters
+            # specific to each case
+            quant_method = getattr(param, "quant_method", None)
+            if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
+                self._load_per_channel_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=self.tp_rank)
+            elif quant_method in [
+                    FusedMoeWeightScaleSupported.GROUP.value,
+                    FusedMoeWeightScaleSupported.BLOCK.value,
+            ]:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=self.tp_rank,
+                    load_full_w2=getattr(param, "load_full_w2", False))
+            elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id)
+            else:
+                raise ValueError(
+                    f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
+            return True if return_success else None
+
+        # Case weight_shape
+        if "weight_shape" in weight_name:
+            # only required by compressed-tensors
+            self._load_single_value(param=param,
+                                    loaded_weight=loaded_weight,
+                                    expert_id=expert_id)
+            return True if return_success else None
+
+        # Case model weights
+        if "weight" in weight_name:
+            self._load_model_weight_or_group_weight_scale(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=self.tp_rank)
+            return True if return_success else None
+
+        return False if return_success else None
+
+    def get_expert_weights(self) -> Iterable[torch.Tensor]:
+        weights = list(self.named_parameters())
+        assert all(weight.is_contiguous() for _, weight in weights)
+
+        # Filter out the non-expert weights.
+        # `e_score_correction_bias` is a bias for each logical expert,
+        # with shape (num_logical_experts,), not an expert weight.
+        NON_EXPERT_WEIGHTS = {
+            "e_score_correction_bias",
+        }
+
+        return [
+            weight.view(self.local_num_experts, -1)
+            for name, weight in weights if name not in NON_EXPERT_WEIGHTS
+        ]
+
+    def set_eplb_state(
+        self,
+        moe_layer_idx: int,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ) -> None:
+        """
+        Register the EPLB state in this layer.
+
+        This is used later in the forward pass, where we get the expert
+        mapping and record the load metrics in `expert_load_view`.
+        """
+        self.expert_load_view = expert_load_view[moe_layer_idx]
+        self.logical_to_physical_map = logical_to_physical_map[moe_layer_idx]
+        self.logical_replica_count = logical_replica_count[moe_layer_idx]
+
+    @staticmethod
+    def select_experts(
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        use_grouped_topk: bool,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        indices_type: Optional[torch.dtype] = None,
+        enable_eplb: bool = False,
+        expert_map: Optional[torch.Tensor] = None,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Route the input hidden states to the top-k experts based on the
+        router logits.
+
+        Returns:
+            (topk_weights, topk_ids) (tuple[torch.Tensor, torch.Tensor]):
+            The weights and *global physical* expert ids of the top-k
+            experts.
+
+            **Compatibility**: When EPLB is not enabled, the returned ids
+            are equivalent to global logical ids, so they should be
+            compatible with plain MoE implementations without redundant
+            experts.
+        """
+        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+
+        # DeepSeekv2 uses grouped_top_k
+        if use_grouped_topk:
+            assert topk_group is not None
+            assert num_expert_group is not None
+            topk_weights, topk_ids = grouped_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                scoring_func=scoring_func,
+                e_score_correction_bias=e_score_correction_bias)
+            if indices_type is not None:
+                topk_ids = topk_ids.to(dtype=indices_type)
+        elif custom_routing_function is None:
+            topk_weights, topk_ids, token_expert_indices = fused_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize,
+                indices_type=indices_type,
+            )
+        else:
+            topk_weights, topk_ids = custom_routing_function(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize)
+            if indices_type is not None:
+                topk_ids = topk_ids.to(dtype=indices_type)
+
+        if enable_eplb:
+            assert expert_load_view is not None
+            assert logical_to_physical_map is not None
+            assert logical_replica_count is not None
+
+            # 1. Convert the logical expert ids to physical expert ids:
+            # directly select a random replica for each logical expert.
+
+            # TODO: maybe optimize this by using specialized kernels,
+            # or compute pseudo-random indices by modulo
+
+            # In case `indices_type` is not `torch.long` or `torch.int`,
+            # e.g. `torch.uint32` as required by dispatch/combine kernels
+            topk_ids_long = topk_ids.long()
+            replica_indices = (
+                torch.rand_like(topk_ids, dtype=torch.float) *
+                logical_replica_count[topk_ids_long]).long().unsqueeze(-1)
+            physical_ids = logical_to_physical_map[topk_ids_long].gather(
+                -1, replica_indices).squeeze(-1)
+
+            topk_ids = physical_ids
+
+            # 2. Record expert load metrics.
+
+            # TODO(bowen): When using `FusedMoEModularKernel`, this
+            # can be done in a more unified way, since
+            # `FusedMoEPrepareAndFinalize` will return the expert
+            # token count, in some cases directly from the kernel.
+            # However, there are currently many code paths that do not
+            # use the modular kernel, e.g. calling `fused_experts`,
+            # so we decided to keep the logic here.
+            #
+            # If a later refactor moves all the MoE kernel calls
+            # to the modular kernel, we can move this logic there
+            # to achieve better efficiency.
+
+            # `expert_load_view`: (num_logical_experts,)
+
+            # Mask out non-local experts
+            if expert_map is not None:
+                topk_ids_local = expert_map[topk_ids]
+                topk_ids_flatten = topk_ids_local.flatten()
+            else:
+                topk_ids_flatten = topk_ids.flatten()
+
+            # Should be equivalent to:
+            # ```
+            # topk_ids_masked = topk_ids_local[topk_ids_local >= 0]
+            # expert_load_view += topk_ids_masked.bincount(
+            #     minlength=expert_load_view.shape[0])
+            # ```
+            # We use `scatter_add_` since `bincount` cannot be compiled.
+
+            # Performance optimization:
+            # `masked_fill` is significantly faster than `masked_select`
+            invalid_mask = topk_ids_flatten < 0
+            # Replace invalid expert ids with 0 (just a dummy position)
+            # to avoid out-of-bounds errors in scatter_add_
+            index = topk_ids_flatten.masked_fill_(invalid_mask, 0)
+            # `src` is the valid mask, which is 1 for valid and 0 for invalid
+            src = ~invalid_mask
+
+            expert_load_view.scatter_add_(dim=0,
+                                          index=index.long(),
+                                          src=src.to(expert_load_view))
+
+            topk_ids = topk_ids.to(dtype=indices_type)
+
+        assert topk_ids.dtype == indices_type or indices_type is None
+
+        return topk_weights, topk_ids
+
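+    # Annotation (not part of the original file): the EPLB replica pick in
+    # select_experts draws a uniform slot per routed expert. E.g. if logical
+    # expert 3 has two physical replicas, at slots 3 and 11:
+    #
+    #   >>> count = 2                              # logical_replica_count[3]
+    #   >>> idx = int(torch.rand(()) * count)      # 0 or 1, uniform
+    #   >>> [3, 11][idx]                           # chosen physical slot
+    #
+    # logical_to_physical_map[3] holds those slots; `gather` picks the
+    # sampled one per token in a single batched op.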
+    def must_reduce_shared_expert_outputs(self) -> bool:
+        """
+        The shared_experts are typically computed using the
+        RowParallelLinear layer. The result of this function is typically
+        used as the reduce_results argument to that module.
+        When just tensor parallelism is used, it is not required to reduce
+        the shared_experts results immediately. Instead we reduce once at
+        the end of the MoE op. (Refer to the DeepSeekV2MoE module.)
+        With EP and all2all kernels this is no longer viable, as all
+        GPU ranks in DP produce the complete set of hidden_states.
+        Therefore it is required that we reduce the shared_experts output
+        early.
+        """
+        return (self.use_pplx_kernels or self.use_deepep_ht_kernels
+                or self.use_deepep_ll_kernels)
+
+    def maybe_all_reduce_tensor_model_parallel(
+            self, final_hidden_states: torch.Tensor):
+        """
+        The pplx combine kernel reduces across GPU ranks by default.
+        """
+        if (self.use_pplx_kernels or self.use_deepep_ht_kernels
+                or self.use_deepep_ll_kernels):
+            return final_hidden_states
+        else:
+            return tensor_model_parallel_all_reduce(final_hidden_states)
+
+    def forward(self, hidden_states: torch.Tensor,
+                router_logits: torch.Tensor):
+        # TODO: Once the OOM issue for the TPU backend is resolved, we will
+        # switch to using the moe_forward custom op.
+        if current_platform.is_tpu():
+            return self.forward_impl(hidden_states, router_logits)
+        else:
+            return torch.ops.vllm.moe_forward(hidden_states, router_logits,
+                                              self.layer_name)
+
+    def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
+                             full_router_logits: torch.Tensor):
+        assert self.batched_hidden_states is not None
+        assert self.batched_router_logits is not None
+        assert self.batched_hidden_states.dtype == full_hidden_states.dtype
+        assert self.batched_router_logits.dtype == full_router_logits.dtype
+        # Check size compatibility.
+        assert (self.batched_hidden_states.size(-1) ==
+                full_hidden_states.size(-1))
+        assert (self.batched_router_logits.size(-1) ==
+                full_router_logits.size(-1))
+
+        full_final_hidden_states = torch.empty_like(full_hidden_states)
+
+        def process_chunk(chunk_start, chunk_end, skip_result_store=False):
+            chunk_size = chunk_end - chunk_start
+            hidden_states = full_hidden_states[chunk_start:chunk_end, :]
+            router_logits = full_router_logits[chunk_start:chunk_end, :]
+
+            assert (self.batched_hidden_states.size(0)  # type: ignore
+                    >= chunk_size)
+            assert (self.batched_router_logits.size(0)  # type: ignore
+                    >= chunk_size)
+            staged_hidden_states = self.batched_hidden_states[
+                :chunk_size, :]  # type: ignore
+            staged_router_logits = self.batched_router_logits[
+                :chunk_size, :]  # type: ignore
+            staged_hidden_states.copy_(hidden_states, non_blocking=True)
+            staged_router_logits.copy_(router_logits, non_blocking=True)
+
+            # Matrix multiply.
+            final_hidden_states = self.quant_method.apply(
+                layer=self,
+                x=staged_hidden_states,
+                router_logits=staged_router_logits,
+                top_k=self.top_k,
+                renormalize=self.renormalize,
+                use_grouped_topk=self.use_grouped_topk,
+                global_num_experts=self.global_num_experts,
+                expert_map=self.expert_map,
+                topk_group=self.topk_group,
+                num_expert_group=self.num_expert_group,
+                custom_routing_function=self.custom_routing_function,
+                scoring_func=self.scoring_func,
+                e_score_correction_bias=self.e_score_correction_bias,
+                activation=self.activation,
+                enable_eplb=self.enable_eplb,
+                expert_load_view=self.expert_load_view,
+                logical_to_physical_map=self.logical_to_physical_map,
+                logical_replica_count=self.logical_replica_count,
+            )
+
+            if not skip_result_store:
+                full_final_hidden_states[chunk_start:chunk_end, :].copy_(
+                    final_hidden_states, non_blocking=True)
+
+        ctx = get_forward_context()
+        max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
+        moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens
+
+        num_tokens = full_hidden_states.size(0)
+        for chunk_start_ in range(0, max_tokens_across_dp,
+                                  moe_dp_chunk_size_per_rank):
+            chunk_start = chunk_start_
+            chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
+                            max_tokens_across_dp)
+            # clamp start and end
+            chunk_start = min(chunk_start, num_tokens - 1)
+            chunk_end = min(chunk_end, num_tokens)
+
+            process_chunk(chunk_start,
+                          chunk_end,
+                          skip_result_store=chunk_start_ >= num_tokens)
+
+        return full_final_hidden_states
+
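+    # Annotation (not part of the original file): a worked example of the
+    # chunk loop above, assuming max_tokens_across_dp=600, a chunk size of
+    # 256, and this rank holding num_tokens=300:
+    #
+    #   chunk [0, 256)   -> processed, result stored
+    #   chunk [256, 300) -> processed, result stored (end clamped to 300)
+    #   chunk [299, 300) -> skip_result_store=True: a dummy pass run only so
+    #                       this rank stays in lockstep with larger DP ranks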
+    def forward_impl(self, hidden_states: torch.Tensor,
+                     router_logits: torch.Tensor):
+        assert self.quant_method is not None
+        if (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels):
+            return self.forward_impl_chunked(hidden_states, router_logits)
+
+        do_naive_dispatch_combine: bool = (
+            self.dp_size > 1
+            and not self.moe_parallel_config.use_deepep_ht_kernels)
+        if do_naive_dispatch_combine:
+            hidden_states, router_logits = get_ep_group().dispatch(
+                hidden_states, router_logits)
+
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            top_k=self.top_k,
+            renormalize=self.renormalize,
+            use_grouped_topk=self.use_grouped_topk,
+            global_num_experts=self.global_num_experts,
+            expert_map=self.expert_map,
+            topk_group=self.topk_group,
+            num_expert_group=self.num_expert_group,
+            custom_routing_function=self.custom_routing_function,
+            scoring_func=self.scoring_func,
+            e_score_correction_bias=self.e_score_correction_bias,
+            activation=self.activation,
+            apply_router_weight_on_input=self.apply_router_weight_on_input,
+            enable_eplb=self.enable_eplb,
+            expert_load_view=self.expert_load_view,
+            logical_to_physical_map=self.logical_to_physical_map,
+            logical_replica_count=self.logical_replica_count,
+        )
+
+        if do_naive_dispatch_combine:
+            final_hidden_states = get_ep_group().combine(final_hidden_states)
+
+        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+            # reduce_results defaults to False. (May have to add shared
+            # expert outputs here.)
+            final_hidden_states = self.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states)
+
+        return final_hidden_states
+
+    @classmethod
+    def make_expert_params_mapping(
+            cls,
+            ckpt_gate_proj_name: str,
+            ckpt_down_proj_name: str,
+            ckpt_up_proj_name: str,
+            num_experts: int,
+            num_redundant_experts: int = 0
+    ) -> list[tuple[str, str, int, str]]:
+
+        num_physical_experts = num_experts + num_redundant_experts
+
+        # In the returned mapping:
+        # - `expert_id` is the physical expert id
+        # - `weight_name` contains the weight name of the logical expert
+        # So we map the physical expert id to its logical id when building
+        # `weight_name`.
+        physical_to_logical_map = \
+            EplbState.build_initial_global_physical_to_logical_map(
+                num_experts, num_redundant_experts)
+
+        return [
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_" if weight_name
+             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
+             f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.",
+             expert_id, shard_id) for expert_id in range(num_physical_experts)
+            for shard_id, weight_name in [
+                ("w1", ckpt_gate_proj_name),
+                ("w2", ckpt_down_proj_name),
+                ("w3", ckpt_up_proj_name),
+            ]
+        ]
+
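+    # Annotation (not part of the original file): with
+    # ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj",
+    # ckpt_up_proj_name="up_proj", and no redundant experts, the first
+    # entries of the mapping returned above are:
+    #
+    #   ("experts.w13_", "experts.0.gate_proj.", 0, "w1")
+    #   ("experts.w2_",  "experts.0.down_proj.", 0, "w2")
+    #   ("experts.w13_", "experts.0.up_proj.",   0, "w3")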
+    def extra_repr(self) -> str:
+
+        s = (
+            f"global_num_experts={self.global_num_experts}, "
+            f"local_num_experts={self.local_num_experts}, "
+            f"top_k={self.top_k}, "
+            f"intermediate_size_per_partition={self.intermediate_size_per_partition}, "  # noqa: E501
+            f"tp_size={self.tp_size},\n"
+            f"ep_size={self.ep_size}, "
+            f"reduce_results={self.reduce_results}, "
+            f"renormalize={self.renormalize}, "
+            f"use_grouped_topk={self.use_grouped_topk}")
+
+        if self.use_grouped_topk:
+            s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501
+
+        s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'"  # noqa: E501
+
+        return s
+
+
+def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
+                layer_name: str) -> torch.Tensor:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    assert self.quant_method is not None
+
+    return self.forward_impl(hidden_states, router_logits)
+
+
+def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
+                     layer_name: str) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+    op_name="moe_forward",
+    op_func=moe_forward,
+    mutates_args=["hidden_states"],
+    fake_impl=moe_forward_fake,
+    dispatch_key=current_platform.dispatch_key,
+    tags=(torch.Tag.needs_fixed_stride_order, ),
+)
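+# Annotation (not part of the original file): after the registration above,
+# FusedMoE.forward reaches this op through the torch op registry, e.g.
+#
+#   >>> out = torch.ops.vllm.moe_forward(hidden_states, router_logits,
+#   ...                                  "model.layers.0.mlp.experts")
+#
+# mutates_args=["hidden_states"] declares that the input buffer may be
+# written in place (fused_experts runs with inplace=True), and the
+# needs_fixed_stride_order tag tells torch.compile to preserve the inputs'
+# eager-mode stride order when calling the op.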