vllm-cpu-avx512bf16 0.14.0 (cp313-cp313-manylinux_2_28_x86_64.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1712)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +1511 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +3206 -0
  6. vllm/_ipex_ops.py +445 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +62 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/layer.py +913 -0
  15. vllm/attention/utils/__init__.py +0 -0
  16. vllm/attention/utils/kv_sharing_utils.py +33 -0
  17. vllm/attention/utils/kv_transfer_utils.py +60 -0
  18. vllm/beam_search.py +88 -0
  19. vllm/benchmarks/__init__.py +0 -0
  20. vllm/benchmarks/datasets.py +3277 -0
  21. vllm/benchmarks/latency.py +172 -0
  22. vllm/benchmarks/lib/__init__.py +3 -0
  23. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  24. vllm/benchmarks/lib/ready_checker.py +72 -0
  25. vllm/benchmarks/lib/utils.py +79 -0
  26. vllm/benchmarks/mm_processor.py +363 -0
  27. vllm/benchmarks/serve.py +1761 -0
  28. vllm/benchmarks/startup.py +321 -0
  29. vllm/benchmarks/sweep/__init__.py +0 -0
  30. vllm/benchmarks/sweep/cli.py +41 -0
  31. vllm/benchmarks/sweep/param_sweep.py +159 -0
  32. vllm/benchmarks/sweep/plot.py +675 -0
  33. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  34. vllm/benchmarks/sweep/serve.py +450 -0
  35. vllm/benchmarks/sweep/serve_sla.py +459 -0
  36. vllm/benchmarks/sweep/server.py +114 -0
  37. vllm/benchmarks/sweep/sla_sweep.py +138 -0
  38. vllm/benchmarks/sweep/utils.py +4 -0
  39. vllm/benchmarks/throughput.py +946 -0
  40. vllm/collect_env.py +857 -0
  41. vllm/compilation/__init__.py +0 -0
  42. vllm/compilation/activation_quant_fusion.py +214 -0
  43. vllm/compilation/backends.py +840 -0
  44. vllm/compilation/base_static_graph.py +57 -0
  45. vllm/compilation/caching.py +196 -0
  46. vllm/compilation/collective_fusion.py +1224 -0
  47. vllm/compilation/compiler_interface.py +639 -0
  48. vllm/compilation/counter.py +50 -0
  49. vllm/compilation/cuda_graph.py +309 -0
  50. vllm/compilation/decorators.py +662 -0
  51. vllm/compilation/fix_functionalization.py +266 -0
  52. vllm/compilation/fusion.py +570 -0
  53. vllm/compilation/fusion_attn.py +363 -0
  54. vllm/compilation/fx_utils.py +92 -0
  55. vllm/compilation/inductor_pass.py +145 -0
  56. vllm/compilation/matcher_utils.py +454 -0
  57. vllm/compilation/monitor.py +62 -0
  58. vllm/compilation/noop_elimination.py +130 -0
  59. vllm/compilation/partition_rules.py +75 -0
  60. vllm/compilation/pass_manager.py +164 -0
  61. vllm/compilation/piecewise_backend.py +191 -0
  62. vllm/compilation/post_cleanup.py +21 -0
  63. vllm/compilation/qk_norm_rope_fusion.py +244 -0
  64. vllm/compilation/rocm_aiter_fusion.py +401 -0
  65. vllm/compilation/sequence_parallelism.py +368 -0
  66. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  67. vllm/compilation/vllm_inductor_pass.py +180 -0
  68. vllm/compilation/wrapper.py +329 -0
  69. vllm/config/__init__.py +112 -0
  70. vllm/config/attention.py +114 -0
  71. vllm/config/cache.py +233 -0
  72. vllm/config/compilation.py +1149 -0
  73. vllm/config/device.py +75 -0
  74. vllm/config/ec_transfer.py +110 -0
  75. vllm/config/kv_events.py +56 -0
  76. vllm/config/kv_transfer.py +119 -0
  77. vllm/config/load.py +124 -0
  78. vllm/config/lora.py +102 -0
  79. vllm/config/model.py +2026 -0
  80. vllm/config/model_arch.py +57 -0
  81. vllm/config/multimodal.py +247 -0
  82. vllm/config/observability.py +157 -0
  83. vllm/config/parallel.py +703 -0
  84. vllm/config/pooler.py +188 -0
  85. vllm/config/profiler.py +199 -0
  86. vllm/config/scheduler.py +298 -0
  87. vllm/config/speculative.py +656 -0
  88. vllm/config/speech_to_text.py +39 -0
  89. vllm/config/structured_outputs.py +78 -0
  90. vllm/config/utils.py +374 -0
  91. vllm/config/vllm.py +1487 -0
  92. vllm/connections.py +189 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +301 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +43 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +509 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +303 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +346 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +190 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  106. vllm/distributed/device_communicators/pynccl.py +386 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +567 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +778 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  113. vllm/distributed/device_communicators/symm_mem.py +156 -0
  114. vllm/distributed/device_communicators/xpu_communicator.py +98 -0
  115. vllm/distributed/ec_transfer/__init__.py +14 -0
  116. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  117. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  118. vllm/distributed/ec_transfer/ec_connector/example_connector.py +201 -0
  119. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  120. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  121. vllm/distributed/eplb/__init__.py +3 -0
  122. vllm/distributed/eplb/async_worker.py +115 -0
  123. vllm/distributed/eplb/eplb_state.py +1192 -0
  124. vllm/distributed/eplb/policy/__init__.py +19 -0
  125. vllm/distributed/eplb/policy/abstract.py +43 -0
  126. vllm/distributed/eplb/policy/default.py +376 -0
  127. vllm/distributed/eplb/rebalance_execute.py +699 -0
  128. vllm/distributed/kv_events.py +505 -0
  129. vllm/distributed/kv_transfer/README.md +29 -0
  130. vllm/distributed/kv_transfer/__init__.py +20 -0
  131. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  132. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  133. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  134. vllm/distributed/kv_transfer/kv_connector/factory.py +203 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +459 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +607 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +450 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +344 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  142. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +395 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +211 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1431 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +941 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +186 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +916 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/moriio/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py +321 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +1515 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py +609 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +477 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2688 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +557 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  159. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  160. vllm/distributed/parallel_state.py +1809 -0
  161. vllm/distributed/utils.py +545 -0
  162. vllm/engine/__init__.py +0 -0
  163. vllm/engine/arg_utils.py +2137 -0
  164. vllm/engine/async_llm_engine.py +6 -0
  165. vllm/engine/llm_engine.py +6 -0
  166. vllm/engine/protocol.py +194 -0
  167. vllm/entrypoints/__init__.py +0 -0
  168. vllm/entrypoints/anthropic/__init__.py +0 -0
  169. vllm/entrypoints/anthropic/protocol.py +162 -0
  170. vllm/entrypoints/anthropic/serving_messages.py +468 -0
  171. vllm/entrypoints/api_server.py +186 -0
  172. vllm/entrypoints/chat_utils.py +1912 -0
  173. vllm/entrypoints/cli/__init__.py +19 -0
  174. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/base.py +25 -0
  176. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  177. vllm/entrypoints/cli/benchmark/main.py +57 -0
  178. vllm/entrypoints/cli/benchmark/mm_processor.py +21 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  180. vllm/entrypoints/cli/benchmark/startup.py +21 -0
  181. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  182. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  183. vllm/entrypoints/cli/collect_env.py +38 -0
  184. vllm/entrypoints/cli/main.py +79 -0
  185. vllm/entrypoints/cli/openai.py +260 -0
  186. vllm/entrypoints/cli/run_batch.py +68 -0
  187. vllm/entrypoints/cli/serve.py +253 -0
  188. vllm/entrypoints/cli/types.py +29 -0
  189. vllm/entrypoints/constants.py +12 -0
  190. vllm/entrypoints/context.py +898 -0
  191. vllm/entrypoints/grpc_server.py +531 -0
  192. vllm/entrypoints/launcher.py +175 -0
  193. vllm/entrypoints/llm.py +1807 -0
  194. vllm/entrypoints/logger.py +86 -0
  195. vllm/entrypoints/openai/__init__.py +0 -0
  196. vllm/entrypoints/openai/api_server.py +1390 -0
  197. vllm/entrypoints/openai/cli_args.py +320 -0
  198. vllm/entrypoints/openai/orca_metrics.py +120 -0
  199. vllm/entrypoints/openai/parser/__init__.py +0 -0
  200. vllm/entrypoints/openai/parser/harmony_utils.py +820 -0
  201. vllm/entrypoints/openai/parser/responses_parser.py +176 -0
  202. vllm/entrypoints/openai/protocol.py +2566 -0
  203. vllm/entrypoints/openai/run_batch.py +635 -0
  204. vllm/entrypoints/openai/serving_chat.py +1897 -0
  205. vllm/entrypoints/openai/serving_chat_stream_harmony.py +101 -0
  206. vllm/entrypoints/openai/serving_completion.py +740 -0
  207. vllm/entrypoints/openai/serving_engine.py +1612 -0
  208. vllm/entrypoints/openai/serving_models.py +309 -0
  209. vllm/entrypoints/openai/serving_responses.py +2552 -0
  210. vllm/entrypoints/openai/serving_transcription.py +168 -0
  211. vllm/entrypoints/openai/speech_to_text.py +711 -0
  212. vllm/entrypoints/openai/utils.py +49 -0
  213. vllm/entrypoints/pooling/__init__.py +16 -0
  214. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  215. vllm/entrypoints/pooling/classify/api_router.py +48 -0
  216. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  217. vllm/entrypoints/pooling/classify/serving.py +233 -0
  218. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  219. vllm/entrypoints/pooling/embed/api_router.py +65 -0
  220. vllm/entrypoints/pooling/embed/conftest.py +28 -0
  221. vllm/entrypoints/pooling/embed/protocol.py +217 -0
  222. vllm/entrypoints/pooling/embed/serving.py +684 -0
  223. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  224. vllm/entrypoints/pooling/pooling/api_router.py +62 -0
  225. vllm/entrypoints/pooling/pooling/protocol.py +146 -0
  226. vllm/entrypoints/pooling/pooling/serving.py +354 -0
  227. vllm/entrypoints/pooling/score/__init__.py +0 -0
  228. vllm/entrypoints/pooling/score/api_router.py +147 -0
  229. vllm/entrypoints/pooling/score/protocol.py +146 -0
  230. vllm/entrypoints/pooling/score/serving.py +511 -0
  231. vllm/entrypoints/renderer.py +411 -0
  232. vllm/entrypoints/responses_utils.py +218 -0
  233. vllm/entrypoints/sagemaker/__init__.py +4 -0
  234. vllm/entrypoints/sagemaker/routes.py +118 -0
  235. vllm/entrypoints/score_utils.py +271 -0
  236. vllm/entrypoints/serve/__init__.py +94 -0
  237. vllm/entrypoints/serve/cache/__init__.py +0 -0
  238. vllm/entrypoints/serve/cache/api_router.py +61 -0
  239. vllm/entrypoints/serve/disagg/__init__.py +0 -0
  240. vllm/entrypoints/serve/disagg/api_router.py +109 -0
  241. vllm/entrypoints/serve/disagg/protocol.py +90 -0
  242. vllm/entrypoints/serve/disagg/serving.py +285 -0
  243. vllm/entrypoints/serve/elastic_ep/__init__.py +0 -0
  244. vllm/entrypoints/serve/elastic_ep/api_router.py +96 -0
  245. vllm/entrypoints/serve/elastic_ep/middleware.py +49 -0
  246. vllm/entrypoints/serve/instrumentator/__init__.py +0 -0
  247. vllm/entrypoints/serve/instrumentator/health.py +33 -0
  248. vllm/entrypoints/serve/instrumentator/metrics.py +45 -0
  249. vllm/entrypoints/serve/instrumentator/offline_docs.py +50 -0
  250. vllm/entrypoints/serve/instrumentator/server_info.py +56 -0
  251. vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js +2 -0
  252. vllm/entrypoints/serve/instrumentator/static/swagger-ui.css +3 -0
  253. vllm/entrypoints/serve/lora/__init__.py +0 -0
  254. vllm/entrypoints/serve/lora/api_router.py +70 -0
  255. vllm/entrypoints/serve/profile/__init__.py +0 -0
  256. vllm/entrypoints/serve/profile/api_router.py +46 -0
  257. vllm/entrypoints/serve/rlhf/__init__.py +0 -0
  258. vllm/entrypoints/serve/rlhf/api_router.py +102 -0
  259. vllm/entrypoints/serve/rpc/__init__.py +0 -0
  260. vllm/entrypoints/serve/rpc/api_router.py +61 -0
  261. vllm/entrypoints/serve/sleep/__init__.py +0 -0
  262. vllm/entrypoints/serve/sleep/api_router.py +56 -0
  263. vllm/entrypoints/serve/tokenize/__init__.py +0 -0
  264. vllm/entrypoints/serve/tokenize/api_router.py +112 -0
  265. vllm/entrypoints/serve/tokenize/serving.py +204 -0
  266. vllm/entrypoints/ssl.py +78 -0
  267. vllm/entrypoints/tool.py +187 -0
  268. vllm/entrypoints/tool_server.py +234 -0
  269. vllm/entrypoints/utils.py +336 -0
  270. vllm/env_override.py +402 -0
  271. vllm/envs.py +1791 -0
  272. vllm/exceptions.py +36 -0
  273. vllm/forward_context.py +375 -0
  274. vllm/grpc/__init__.py +17 -0
  275. vllm/grpc/compile_protos.py +94 -0
  276. vllm/grpc/vllm_engine.proto +195 -0
  277. vllm/grpc/vllm_engine_pb2.py +77 -0
  278. vllm/grpc/vllm_engine_pb2.pyi +213 -0
  279. vllm/grpc/vllm_engine_pb2_grpc.py +330 -0
  280. vllm/inputs/__init__.py +44 -0
  281. vllm/inputs/data.py +359 -0
  282. vllm/inputs/parse.py +147 -0
  283. vllm/inputs/preprocess.py +716 -0
  284. vllm/logger.py +303 -0
  285. vllm/logging_utils/__init__.py +13 -0
  286. vllm/logging_utils/dump_input.py +83 -0
  287. vllm/logging_utils/formatter.py +127 -0
  288. vllm/logging_utils/lazy.py +20 -0
  289. vllm/logging_utils/log_time.py +34 -0
  290. vllm/logits_process.py +121 -0
  291. vllm/logprobs.py +206 -0
  292. vllm/lora/__init__.py +0 -0
  293. vllm/lora/layers/__init__.py +43 -0
  294. vllm/lora/layers/base.py +66 -0
  295. vllm/lora/layers/base_linear.py +172 -0
  296. vllm/lora/layers/column_parallel_linear.py +577 -0
  297. vllm/lora/layers/fused_moe.py +739 -0
  298. vllm/lora/layers/logits_processor.py +203 -0
  299. vllm/lora/layers/replicated_linear.py +70 -0
  300. vllm/lora/layers/row_parallel_linear.py +176 -0
  301. vllm/lora/layers/utils.py +115 -0
  302. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  303. vllm/lora/lora_model.py +221 -0
  304. vllm/lora/lora_weights.py +227 -0
  305. vllm/lora/model_manager.py +858 -0
  306. vllm/lora/ops/__init__.py +0 -0
  307. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  308. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  309. vllm/lora/ops/torch_ops/__init__.py +20 -0
  310. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  311. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  312. vllm/lora/ops/triton_ops/__init__.py +21 -0
  313. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +677 -0
  314. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  315. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  316. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  317. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  318. vllm/lora/ops/triton_ops/utils.py +313 -0
  319. vllm/lora/peft_helper.py +128 -0
  320. vllm/lora/punica_wrapper/__init__.py +10 -0
  321. vllm/lora/punica_wrapper/punica_base.py +493 -0
  322. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  323. vllm/lora/punica_wrapper/punica_gpu.py +413 -0
  324. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  325. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  326. vllm/lora/punica_wrapper/utils.py +150 -0
  327. vllm/lora/request.py +60 -0
  328. vllm/lora/resolver.py +88 -0
  329. vllm/lora/utils.py +281 -0
  330. vllm/lora/worker_manager.py +278 -0
  331. vllm/model_executor/__init__.py +9 -0
  332. vllm/model_executor/custom_op.py +203 -0
  333. vllm/model_executor/layers/__init__.py +0 -0
  334. vllm/model_executor/layers/activation.py +628 -0
  335. vllm/model_executor/layers/attention/__init__.py +0 -0
  336. vllm/model_executor/layers/attention/chunked_local_attention.py +130 -0
  337. vllm/model_executor/layers/attention/cross_attention.py +182 -0
  338. vllm/model_executor/layers/attention/encoder_only_attention.py +103 -0
  339. vllm/model_executor/layers/attention/mm_encoder_attention.py +234 -0
  340. vllm/model_executor/layers/attention/static_sink_attention.py +254 -0
  341. vllm/model_executor/layers/attention_layer_base.py +34 -0
  342. vllm/model_executor/layers/batch_invariant.py +1063 -0
  343. vllm/model_executor/layers/conv.py +262 -0
  344. vllm/model_executor/layers/fla/__init__.py +8 -0
  345. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  346. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  347. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  348. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  349. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  350. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  351. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  352. vllm/model_executor/layers/fla/ops/index.py +41 -0
  353. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  354. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  355. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  356. vllm/model_executor/layers/fla/ops/op.py +60 -0
  357. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  358. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  359. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  360. vllm/model_executor/layers/fused_moe/__init__.py +120 -0
  361. vllm/model_executor/layers/fused_moe/all2all_utils.py +173 -0
  362. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +411 -0
  363. vllm/model_executor/layers/fused_moe/config.py +1111 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json +147 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  645. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  646. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  647. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  648. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  649. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  650. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  651. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +444 -0
  652. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1086 -0
  653. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +364 -0
  654. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +427 -0
  655. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  656. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +436 -0
  657. vllm/model_executor/layers/fused_moe/fallback.py +127 -0
  658. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +338 -0
  659. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +310 -0
  660. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +371 -0
  661. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  662. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1018 -0
  663. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +824 -0
  664. vllm/model_executor/layers/fused_moe/fused_moe.py +2638 -0
  665. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +119 -0
  666. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +117 -0
  667. vllm/model_executor/layers/fused_moe/fused_moe_router.py +40 -0
  668. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +531 -0
  669. vllm/model_executor/layers/fused_moe/layer.py +2169 -0
  670. vllm/model_executor/layers/fused_moe/modular_kernel.py +1251 -0
  671. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +192 -0
  672. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  673. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  674. vllm/model_executor/layers/fused_moe/oracle/__init__.py +2 -0
  675. vllm/model_executor/layers/fused_moe/oracle/fp8.py +358 -0
  676. vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +280 -0
  677. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  678. vllm/model_executor/layers/fused_moe/prepare_finalize.py +87 -0
  679. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +347 -0
  680. vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +324 -0
  681. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  682. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  683. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  684. vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py +78 -0
  685. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +75 -0
  686. vllm/model_executor/layers/fused_moe/trtllm_moe.py +144 -0
  687. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +403 -0
  688. vllm/model_executor/layers/fused_moe/utils.py +382 -0
  689. vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +189 -0
  690. vllm/model_executor/layers/kda.py +442 -0
  691. vllm/model_executor/layers/layernorm.py +451 -0
  692. vllm/model_executor/layers/lightning_attn.py +735 -0
  693. vllm/model_executor/layers/linear.py +1478 -0
  694. vllm/model_executor/layers/logits_processor.py +109 -0
  695. vllm/model_executor/layers/mamba/__init__.py +0 -0
  696. vllm/model_executor/layers/mamba/abstract.py +68 -0
  697. vllm/model_executor/layers/mamba/linear_attn.py +410 -0
  698. vllm/model_executor/layers/mamba/mamba_mixer.py +541 -0
  699. vllm/model_executor/layers/mamba/mamba_mixer2.py +936 -0
  700. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  701. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  702. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  703. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  704. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +586 -0
  705. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  706. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  707. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  708. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  709. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  710. vllm/model_executor/layers/mamba/short_conv.py +254 -0
  711. vllm/model_executor/layers/mla.py +179 -0
  712. vllm/model_executor/layers/pooler/__init__.py +5 -0
  713. vllm/model_executor/layers/pooler/abstract.py +39 -0
  714. vllm/model_executor/layers/pooler/activations.py +162 -0
  715. vllm/model_executor/layers/pooler/common.py +32 -0
  716. vllm/model_executor/layers/pooler/seqwise/__init__.py +45 -0
  717. vllm/model_executor/layers/pooler/seqwise/heads.py +151 -0
  718. vllm/model_executor/layers/pooler/seqwise/methods.py +93 -0
  719. vllm/model_executor/layers/pooler/seqwise/poolers.py +127 -0
  720. vllm/model_executor/layers/pooler/special.py +128 -0
  721. vllm/model_executor/layers/pooler/tokwise/__init__.py +39 -0
  722. vllm/model_executor/layers/pooler/tokwise/heads.py +133 -0
  723. vllm/model_executor/layers/pooler/tokwise/methods.py +122 -0
  724. vllm/model_executor/layers/pooler/tokwise/poolers.py +127 -0
  725. vllm/model_executor/layers/quantization/__init__.py +195 -0
  726. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  727. vllm/model_executor/layers/quantization/awq.py +277 -0
  728. vllm/model_executor/layers/quantization/awq_marlin.py +795 -0
  729. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  730. vllm/model_executor/layers/quantization/base_config.py +170 -0
  731. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  732. vllm/model_executor/layers/quantization/bitsandbytes.py +631 -0
  733. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  734. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +982 -0
  735. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2368 -0
  736. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +37 -0
  737. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  738. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  739. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  740. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py +106 -0
  741. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  742. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  743. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +176 -0
  744. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  745. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  746. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +203 -0
  747. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  748. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  749. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  750. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  751. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  752. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  753. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  754. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  755. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  756. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  757. vllm/model_executor/layers/quantization/cpu_wna16.py +299 -0
  758. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  759. vllm/model_executor/layers/quantization/experts_int8.py +209 -0
  760. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  761. vllm/model_executor/layers/quantization/fp8.py +1224 -0
  762. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  763. vllm/model_executor/layers/quantization/gguf.py +682 -0
  764. vllm/model_executor/layers/quantization/gptq.py +393 -0
  765. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  766. vllm/model_executor/layers/quantization/gptq_marlin.py +934 -0
  767. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  768. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  769. vllm/model_executor/layers/quantization/inc.py +65 -0
  770. vllm/model_executor/layers/quantization/input_quant_fp8.py +212 -0
  771. vllm/model_executor/layers/quantization/ipex_quant.py +403 -0
  772. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  773. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  774. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +113 -0
  775. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  776. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  777. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  778. vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py +126 -0
  779. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +130 -0
  780. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  781. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +168 -0
  782. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  783. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  784. vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py +97 -0
  785. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +76 -0
  786. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +77 -0
  787. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +128 -0
  788. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +220 -0
  789. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +147 -0
  790. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +88 -0
  791. vllm/model_executor/layers/quantization/kv_cache.py +153 -0
  792. vllm/model_executor/layers/quantization/modelopt.py +1665 -0
  793. vllm/model_executor/layers/quantization/moe_wna16.py +518 -0
  794. vllm/model_executor/layers/quantization/mxfp4.py +1145 -0
  795. vllm/model_executor/layers/quantization/petit.py +319 -0
  796. vllm/model_executor/layers/quantization/ptpc_fp8.py +140 -0
  797. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  798. vllm/model_executor/layers/quantization/quark/quark.py +570 -0
  799. vllm/model_executor/layers/quantization/quark/quark_moe.py +797 -0
  800. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  801. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  802. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  803. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  804. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  805. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  806. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  807. vllm/model_executor/layers/quantization/rtn.py +626 -0
  808. vllm/model_executor/layers/quantization/schema.py +90 -0
  809. vllm/model_executor/layers/quantization/torchao.py +380 -0
  810. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  811. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  812. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1002. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1003. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1004. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1005. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1006. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1007. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1008. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1009. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1010. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1011. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1012. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1013. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1014. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1015. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1016. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1017. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1018. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1019. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1020. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1021. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1022. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1023. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1024. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1025. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1026. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1027. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1028. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +514 -0
  1029. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +370 -0
  1030. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1658 -0
  1031. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1032. vllm/model_executor/layers/quantization/utils/int8_utils.py +477 -0
  1033. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1034. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1035. vllm/model_executor/layers/quantization/utils/marlin_utils.py +720 -0
  1036. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +565 -0
  1037. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1038. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1039. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1040. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +189 -0
  1041. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1042. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1043. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1044. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1045. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1046. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1047. vllm/model_executor/layers/quantization/utils/quant_utils.py +767 -0
  1048. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +519 -0
  1049. vllm/model_executor/layers/resampler.py +283 -0
  1050. vllm/model_executor/layers/rotary_embedding/__init__.py +291 -0
  1051. vllm/model_executor/layers/rotary_embedding/base.py +282 -0
  1052. vllm/model_executor/layers/rotary_embedding/common.py +289 -0
  1053. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +184 -0
  1054. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +218 -0
  1055. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1056. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1057. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +82 -0
  1058. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1059. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1060. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +83 -0
  1061. vllm/model_executor/layers/rotary_embedding/mrope.py +412 -0
  1062. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1063. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1064. vllm/model_executor/layers/rotary_embedding/xdrope.py +160 -0
  1065. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1066. vllm/model_executor/layers/utils.py +251 -0
  1067. vllm/model_executor/layers/vocab_parallel_embedding.py +564 -0
  1068. vllm/model_executor/model_loader/__init__.py +150 -0
  1069. vllm/model_executor/model_loader/base_loader.py +71 -0
  1070. vllm/model_executor/model_loader/bitsandbytes_loader.py +821 -0
  1071. vllm/model_executor/model_loader/default_loader.py +304 -0
  1072. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1073. vllm/model_executor/model_loader/gguf_loader.py +371 -0
  1074. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1075. vllm/model_executor/model_loader/runai_streamer_loader.py +115 -0
  1076. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1077. vllm/model_executor/model_loader/tensorizer.py +793 -0
  1078. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1079. vllm/model_executor/model_loader/utils.py +299 -0
  1080. vllm/model_executor/model_loader/weight_utils.py +1183 -0
  1081. vllm/model_executor/models/__init__.py +44 -0
  1082. vllm/model_executor/models/adapters.py +592 -0
  1083. vllm/model_executor/models/afmoe.py +697 -0
  1084. vllm/model_executor/models/aimv2.py +248 -0
  1085. vllm/model_executor/models/apertus.py +567 -0
  1086. vllm/model_executor/models/arcee.py +428 -0
  1087. vllm/model_executor/models/arctic.py +633 -0
  1088. vllm/model_executor/models/aria.py +663 -0
  1089. vllm/model_executor/models/audioflamingo3.py +639 -0
  1090. vllm/model_executor/models/aya_vision.py +448 -0
  1091. vllm/model_executor/models/bagel.py +591 -0
  1092. vllm/model_executor/models/baichuan.py +493 -0
  1093. vllm/model_executor/models/bailing_moe.py +643 -0
  1094. vllm/model_executor/models/bamba.py +511 -0
  1095. vllm/model_executor/models/bee.py +157 -0
  1096. vllm/model_executor/models/bert.py +911 -0
  1097. vllm/model_executor/models/bert_with_rope.py +729 -0
  1098. vllm/model_executor/models/blip.py +350 -0
  1099. vllm/model_executor/models/blip2.py +736 -0
  1100. vllm/model_executor/models/bloom.py +390 -0
  1101. vllm/model_executor/models/chameleon.py +1095 -0
  1102. vllm/model_executor/models/chatglm.py +502 -0
  1103. vllm/model_executor/models/clip.py +1045 -0
  1104. vllm/model_executor/models/cohere2_vision.py +470 -0
  1105. vllm/model_executor/models/commandr.py +469 -0
  1106. vllm/model_executor/models/config.py +571 -0
  1107. vllm/model_executor/models/dbrx.py +484 -0
  1108. vllm/model_executor/models/deepencoder.py +679 -0
  1109. vllm/model_executor/models/deepseek_eagle.py +253 -0
  1110. vllm/model_executor/models/deepseek_mtp.py +447 -0
  1111. vllm/model_executor/models/deepseek_ocr.py +601 -0
  1112. vllm/model_executor/models/deepseek_v2.py +1727 -0
  1113. vllm/model_executor/models/deepseek_vl2.py +642 -0
  1114. vllm/model_executor/models/dots1.py +566 -0
  1115. vllm/model_executor/models/dots_ocr.py +830 -0
  1116. vllm/model_executor/models/ernie45.py +53 -0
  1117. vllm/model_executor/models/ernie45_moe.py +755 -0
  1118. vllm/model_executor/models/ernie45_vl.py +1702 -0
  1119. vllm/model_executor/models/ernie45_vl_moe.py +801 -0
  1120. vllm/model_executor/models/ernie_mtp.py +278 -0
  1121. vllm/model_executor/models/exaone.py +524 -0
  1122. vllm/model_executor/models/exaone4.py +518 -0
  1123. vllm/model_executor/models/exaone_moe.py +579 -0
  1124. vllm/model_executor/models/exaone_moe_mtp.py +255 -0
  1125. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1126. vllm/model_executor/models/falcon.py +543 -0
  1127. vllm/model_executor/models/falcon_h1.py +675 -0
  1128. vllm/model_executor/models/flex_olmo.py +155 -0
  1129. vllm/model_executor/models/fuyu.py +371 -0
  1130. vllm/model_executor/models/gemma.py +425 -0
  1131. vllm/model_executor/models/gemma2.py +435 -0
  1132. vllm/model_executor/models/gemma3.py +520 -0
  1133. vllm/model_executor/models/gemma3_mm.py +664 -0
  1134. vllm/model_executor/models/gemma3n.py +1166 -0
  1135. vllm/model_executor/models/gemma3n_audio_utils.py +57 -0
  1136. vllm/model_executor/models/gemma3n_mm.py +820 -0
  1137. vllm/model_executor/models/glm.py +24 -0
  1138. vllm/model_executor/models/glm4.py +295 -0
  1139. vllm/model_executor/models/glm4_1v.py +1823 -0
  1140. vllm/model_executor/models/glm4_moe.py +725 -0
  1141. vllm/model_executor/models/glm4_moe_mtp.py +365 -0
  1142. vllm/model_executor/models/glm4v.py +783 -0
  1143. vllm/model_executor/models/glmasr.py +1154 -0
  1144. vllm/model_executor/models/glmasr_utils.py +188 -0
  1145. vllm/model_executor/models/gpt2.py +385 -0
  1146. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1147. vllm/model_executor/models/gpt_j.py +346 -0
  1148. vllm/model_executor/models/gpt_neox.py +340 -0
  1149. vllm/model_executor/models/gpt_oss.py +745 -0
  1150. vllm/model_executor/models/granite.py +475 -0
  1151. vllm/model_executor/models/granite_speech.py +919 -0
  1152. vllm/model_executor/models/granitemoe.py +561 -0
  1153. vllm/model_executor/models/granitemoehybrid.py +703 -0
  1154. vllm/model_executor/models/granitemoeshared.py +328 -0
  1155. vllm/model_executor/models/gritlm.py +242 -0
  1156. vllm/model_executor/models/grok1.py +803 -0
  1157. vllm/model_executor/models/h2ovl.py +554 -0
  1158. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1159. vllm/model_executor/models/hunyuan_vision.py +1034 -0
  1160. vllm/model_executor/models/hyperclovax_vision.py +1163 -0
  1161. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1162. vllm/model_executor/models/idefics3.py +734 -0
  1163. vllm/model_executor/models/interfaces.py +1180 -0
  1164. vllm/model_executor/models/interfaces_base.py +252 -0
  1165. vllm/model_executor/models/intern_vit.py +454 -0
  1166. vllm/model_executor/models/internlm2.py +451 -0
  1167. vllm/model_executor/models/internlm2_ve.py +139 -0
  1168. vllm/model_executor/models/interns1.py +828 -0
  1169. vllm/model_executor/models/interns1_vit.py +433 -0
  1170. vllm/model_executor/models/internvl.py +1436 -0
  1171. vllm/model_executor/models/iquest_loopcoder.py +595 -0
  1172. vllm/model_executor/models/isaac.py +1503 -0
  1173. vllm/model_executor/models/jais.py +397 -0
  1174. vllm/model_executor/models/jais2.py +508 -0
  1175. vllm/model_executor/models/jamba.py +599 -0
  1176. vllm/model_executor/models/jina_vl.py +145 -0
  1177. vllm/model_executor/models/kanana_v.py +756 -0
  1178. vllm/model_executor/models/keye.py +1709 -0
  1179. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1180. vllm/model_executor/models/kimi_linear.py +659 -0
  1181. vllm/model_executor/models/kimi_vl.py +577 -0
  1182. vllm/model_executor/models/lfm2.py +515 -0
  1183. vllm/model_executor/models/lfm2_moe.py +746 -0
  1184. vllm/model_executor/models/lfm2_vl.py +732 -0
  1185. vllm/model_executor/models/lightonocr.py +197 -0
  1186. vllm/model_executor/models/llama.py +724 -0
  1187. vllm/model_executor/models/llama4.py +860 -0
  1188. vllm/model_executor/models/llama4_eagle.py +225 -0
  1189. vllm/model_executor/models/llama_eagle.py +213 -0
  1190. vllm/model_executor/models/llama_eagle3.py +375 -0
  1191. vllm/model_executor/models/llava.py +879 -0
  1192. vllm/model_executor/models/llava_next.py +583 -0
  1193. vllm/model_executor/models/llava_next_video.py +467 -0
  1194. vllm/model_executor/models/llava_onevision.py +922 -0
  1195. vllm/model_executor/models/longcat_flash.py +767 -0
  1196. vllm/model_executor/models/longcat_flash_mtp.py +348 -0
  1197. vllm/model_executor/models/mamba.py +276 -0
  1198. vllm/model_executor/models/mamba2.py +288 -0
  1199. vllm/model_executor/models/medusa.py +179 -0
  1200. vllm/model_executor/models/midashenglm.py +826 -0
  1201. vllm/model_executor/models/mimo.py +188 -0
  1202. vllm/model_executor/models/mimo_mtp.py +294 -0
  1203. vllm/model_executor/models/mimo_v2_flash.py +718 -0
  1204. vllm/model_executor/models/minicpm.py +660 -0
  1205. vllm/model_executor/models/minicpm3.py +233 -0
  1206. vllm/model_executor/models/minicpm_eagle.py +386 -0
  1207. vllm/model_executor/models/minicpmo.py +768 -0
  1208. vllm/model_executor/models/minicpmv.py +1742 -0
  1209. vllm/model_executor/models/minimax_m2.py +552 -0
  1210. vllm/model_executor/models/minimax_text_01.py +1008 -0
  1211. vllm/model_executor/models/minimax_vl_01.py +395 -0
  1212. vllm/model_executor/models/mistral3.py +638 -0
  1213. vllm/model_executor/models/mistral_large_3.py +63 -0
  1214. vllm/model_executor/models/mistral_large_3_eagle.py +137 -0
  1215. vllm/model_executor/models/mixtral.py +599 -0
  1216. vllm/model_executor/models/mllama4.py +1170 -0
  1217. vllm/model_executor/models/mlp_speculator.py +235 -0
  1218. vllm/model_executor/models/modernbert.py +458 -0
  1219. vllm/model_executor/models/module_mapping.py +74 -0
  1220. vllm/model_executor/models/molmo.py +1592 -0
  1221. vllm/model_executor/models/moonvit.py +601 -0
  1222. vllm/model_executor/models/mpt.py +335 -0
  1223. vllm/model_executor/models/nano_nemotron_vl.py +1725 -0
  1224. vllm/model_executor/models/nemotron.py +499 -0
  1225. vllm/model_executor/models/nemotron_h.py +902 -0
  1226. vllm/model_executor/models/nemotron_nas.py +474 -0
  1227. vllm/model_executor/models/nemotron_parse.py +958 -0
  1228. vllm/model_executor/models/nemotron_vl.py +651 -0
  1229. vllm/model_executor/models/nvlm_d.py +216 -0
  1230. vllm/model_executor/models/olmo.py +412 -0
  1231. vllm/model_executor/models/olmo2.py +454 -0
  1232. vllm/model_executor/models/olmoe.py +498 -0
  1233. vllm/model_executor/models/opencua.py +262 -0
  1234. vllm/model_executor/models/openpangu.py +1378 -0
  1235. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1236. vllm/model_executor/models/opt.py +426 -0
  1237. vllm/model_executor/models/orion.py +365 -0
  1238. vllm/model_executor/models/ouro.py +507 -0
  1239. vllm/model_executor/models/ovis.py +557 -0
  1240. vllm/model_executor/models/ovis2_5.py +661 -0
  1241. vllm/model_executor/models/paddleocr_vl.py +1261 -0
  1242. vllm/model_executor/models/paligemma.py +429 -0
  1243. vllm/model_executor/models/persimmon.py +373 -0
  1244. vllm/model_executor/models/phi.py +363 -0
  1245. vllm/model_executor/models/phi3.py +18 -0
  1246. vllm/model_executor/models/phi3v.py +729 -0
  1247. vllm/model_executor/models/phi4mm.py +1250 -0
  1248. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1249. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1250. vllm/model_executor/models/phimoe.py +671 -0
  1251. vllm/model_executor/models/pixtral.py +1437 -0
  1252. vllm/model_executor/models/plamo2.py +993 -0
  1253. vllm/model_executor/models/plamo3.py +437 -0
  1254. vllm/model_executor/models/qwen.py +377 -0
  1255. vllm/model_executor/models/qwen2.py +600 -0
  1256. vllm/model_executor/models/qwen2_5_omni_thinker.py +1200 -0
  1257. vllm/model_executor/models/qwen2_5_vl.py +1598 -0
  1258. vllm/model_executor/models/qwen2_audio.py +478 -0
  1259. vllm/model_executor/models/qwen2_moe.py +604 -0
  1260. vllm/model_executor/models/qwen2_rm.py +120 -0
  1261. vllm/model_executor/models/qwen2_vl.py +1588 -0
  1262. vllm/model_executor/models/qwen3.py +331 -0
  1263. vllm/model_executor/models/qwen3_moe.py +752 -0
  1264. vllm/model_executor/models/qwen3_next.py +1410 -0
  1265. vllm/model_executor/models/qwen3_next_mtp.py +293 -0
  1266. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1814 -0
  1267. vllm/model_executor/models/qwen3_vl.py +2120 -0
  1268. vllm/model_executor/models/qwen3_vl_moe.py +474 -0
  1269. vllm/model_executor/models/qwen_vl.py +821 -0
  1270. vllm/model_executor/models/radio.py +573 -0
  1271. vllm/model_executor/models/registry.py +1218 -0
  1272. vllm/model_executor/models/roberta.py +239 -0
  1273. vllm/model_executor/models/rvl.py +107 -0
  1274. vllm/model_executor/models/seed_oss.py +492 -0
  1275. vllm/model_executor/models/siglip.py +1259 -0
  1276. vllm/model_executor/models/siglip2.py +495 -0
  1277. vllm/model_executor/models/siglip2navit.py +660 -0
  1278. vllm/model_executor/models/skyworkr1v.py +951 -0
  1279. vllm/model_executor/models/smolvlm.py +38 -0
  1280. vllm/model_executor/models/solar.py +484 -0
  1281. vllm/model_executor/models/stablelm.py +354 -0
  1282. vllm/model_executor/models/starcoder2.py +365 -0
  1283. vllm/model_executor/models/step3_text.py +554 -0
  1284. vllm/model_executor/models/step3_vl.py +1147 -0
  1285. vllm/model_executor/models/swin.py +500 -0
  1286. vllm/model_executor/models/tarsier.py +624 -0
  1287. vllm/model_executor/models/telechat2.py +153 -0
  1288. vllm/model_executor/models/teleflm.py +78 -0
  1289. vllm/model_executor/models/terratorch.py +318 -0
  1290. vllm/model_executor/models/transformers/__init__.py +127 -0
  1291. vllm/model_executor/models/transformers/base.py +523 -0
  1292. vllm/model_executor/models/transformers/causal.py +65 -0
  1293. vllm/model_executor/models/transformers/legacy.py +90 -0
  1294. vllm/model_executor/models/transformers/moe.py +329 -0
  1295. vllm/model_executor/models/transformers/multimodal.py +441 -0
  1296. vllm/model_executor/models/transformers/pooling.py +102 -0
  1297. vllm/model_executor/models/transformers/utils.py +253 -0
  1298. vllm/model_executor/models/ultravox.py +786 -0
  1299. vllm/model_executor/models/utils.py +832 -0
  1300. vllm/model_executor/models/vision.py +546 -0
  1301. vllm/model_executor/models/voxtral.py +867 -0
  1302. vllm/model_executor/models/voxtral_streaming.py +304 -0
  1303. vllm/model_executor/models/whisper.py +993 -0
  1304. vllm/model_executor/models/whisper_utils.py +299 -0
  1305. vllm/model_executor/models/zamba2.py +986 -0
  1306. vllm/model_executor/parameter.py +642 -0
  1307. vllm/model_executor/utils.py +113 -0
  1308. vllm/model_executor/warmup/__init__.py +0 -0
  1309. vllm/model_executor/warmup/deep_gemm_warmup.py +371 -0
  1310. vllm/model_executor/warmup/kernel_warmup.py +97 -0
  1311. vllm/model_inspection.py +136 -0
  1312. vllm/multimodal/__init__.py +38 -0
  1313. vllm/multimodal/audio.py +287 -0
  1314. vllm/multimodal/base.py +60 -0
  1315. vllm/multimodal/cache.py +829 -0
  1316. vllm/multimodal/evs.py +294 -0
  1317. vllm/multimodal/hasher.py +123 -0
  1318. vllm/multimodal/image.py +155 -0
  1319. vllm/multimodal/inputs.py +1027 -0
  1320. vllm/multimodal/parse.py +674 -0
  1321. vllm/multimodal/processing.py +2469 -0
  1322. vllm/multimodal/profiling.py +351 -0
  1323. vllm/multimodal/registry.py +375 -0
  1324. vllm/multimodal/utils.py +550 -0
  1325. vllm/multimodal/video.py +512 -0
  1326. vllm/outputs.py +347 -0
  1327. vllm/platforms/__init__.py +277 -0
  1328. vllm/platforms/cpu.py +423 -0
  1329. vllm/platforms/cuda.py +618 -0
  1330. vllm/platforms/interface.py +707 -0
  1331. vllm/platforms/rocm.py +586 -0
  1332. vllm/platforms/tpu.py +20 -0
  1333. vllm/platforms/xpu.py +262 -0
  1334. vllm/plugins/__init__.py +81 -0
  1335. vllm/plugins/io_processors/__init__.py +68 -0
  1336. vllm/plugins/io_processors/interface.py +77 -0
  1337. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1338. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1339. vllm/pooling_params.py +229 -0
  1340. vllm/profiler/__init__.py +0 -0
  1341. vllm/profiler/layerwise_profile.py +392 -0
  1342. vllm/profiler/utils.py +151 -0
  1343. vllm/profiler/wrapper.py +241 -0
  1344. vllm/py.typed +2 -0
  1345. vllm/ray/__init__.py +0 -0
  1346. vllm/ray/lazy_utils.py +30 -0
  1347. vllm/ray/ray_env.py +79 -0
  1348. vllm/reasoning/__init__.py +96 -0
  1349. vllm/reasoning/abs_reasoning_parsers.py +318 -0
  1350. vllm/reasoning/basic_parsers.py +175 -0
  1351. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1352. vllm/reasoning/deepseek_v3_reasoning_parser.py +69 -0
  1353. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1354. vllm/reasoning/glm4_moe_reasoning_parser.py +13 -0
  1355. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1356. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1357. vllm/reasoning/holo2_reasoning_parser.py +89 -0
  1358. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1359. vllm/reasoning/identity_reasoning_parser.py +63 -0
  1360. vllm/reasoning/minimax_m2_reasoning_parser.py +110 -0
  1361. vllm/reasoning/mistral_reasoning_parser.py +154 -0
  1362. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1363. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1364. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1365. vllm/reasoning/step3_reasoning_parser.py +113 -0
  1366. vllm/sampling_params.py +629 -0
  1367. vllm/scalar_type.py +355 -0
  1368. vllm/scripts.py +17 -0
  1369. vllm/sequence.py +64 -0
  1370. vllm/tasks.py +13 -0
  1371. vllm/third_party/__init__.py +0 -0
  1372. vllm/third_party/pynvml.py +6140 -0
  1373. vllm/tokenizers/__init__.py +18 -0
  1374. vllm/tokenizers/deepseek_v32.py +187 -0
  1375. vllm/tokenizers/deepseek_v32_encoding.py +463 -0
  1376. vllm/tokenizers/detokenizer_utils.py +198 -0
  1377. vllm/tokenizers/grok2.py +443 -0
  1378. vllm/tokenizers/hf.py +119 -0
  1379. vllm/tokenizers/mistral.py +543 -0
  1380. vllm/tokenizers/protocol.py +123 -0
  1381. vllm/tokenizers/registry.py +238 -0
  1382. vllm/tool_parsers/__init__.py +158 -0
  1383. vllm/tool_parsers/abstract_tool_parser.py +274 -0
  1384. vllm/tool_parsers/deepseekv31_tool_parser.py +388 -0
  1385. vllm/tool_parsers/deepseekv32_tool_parser.py +591 -0
  1386. vllm/tool_parsers/deepseekv3_tool_parser.py +390 -0
  1387. vllm/tool_parsers/ernie45_tool_parser.py +210 -0
  1388. vllm/tool_parsers/functiongemma_tool_parser.py +321 -0
  1389. vllm/tool_parsers/gigachat3_tool_parser.py +190 -0
  1390. vllm/tool_parsers/glm47_moe_tool_parser.py +23 -0
  1391. vllm/tool_parsers/glm4_moe_tool_parser.py +215 -0
  1392. vllm/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  1393. vllm/tool_parsers/granite_tool_parser.py +253 -0
  1394. vllm/tool_parsers/hermes_tool_parser.py +495 -0
  1395. vllm/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  1396. vllm/tool_parsers/internlm2_tool_parser.py +227 -0
  1397. vllm/tool_parsers/jamba_tool_parser.py +323 -0
  1398. vllm/tool_parsers/kimi_k2_tool_parser.py +598 -0
  1399. vllm/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  1400. vllm/tool_parsers/llama_tool_parser.py +324 -0
  1401. vllm/tool_parsers/longcat_tool_parser.py +37 -0
  1402. vllm/tool_parsers/minimax_m2_tool_parser.py +776 -0
  1403. vllm/tool_parsers/minimax_tool_parser.py +849 -0
  1404. vllm/tool_parsers/mistral_tool_parser.py +612 -0
  1405. vllm/tool_parsers/olmo3_tool_parser.py +366 -0
  1406. vllm/tool_parsers/openai_tool_parser.py +111 -0
  1407. vllm/tool_parsers/phi4mini_tool_parser.py +120 -0
  1408. vllm/tool_parsers/pythonic_tool_parser.py +332 -0
  1409. vllm/tool_parsers/qwen3coder_tool_parser.py +781 -0
  1410. vllm/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  1411. vllm/tool_parsers/seed_oss_tool_parser.py +744 -0
  1412. vllm/tool_parsers/step3_tool_parser.py +303 -0
  1413. vllm/tool_parsers/utils.py +229 -0
  1414. vllm/tool_parsers/xlam_tool_parser.py +556 -0
  1415. vllm/tracing.py +135 -0
  1416. vllm/transformers_utils/__init__.py +26 -0
  1417. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1418. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1419. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1420. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1421. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1422. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1423. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1424. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1425. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1426. vllm/transformers_utils/config.py +1169 -0
  1427. vllm/transformers_utils/config_parser_base.py +20 -0
  1428. vllm/transformers_utils/configs/__init__.py +106 -0
  1429. vllm/transformers_utils/configs/afmoe.py +87 -0
  1430. vllm/transformers_utils/configs/arctic.py +216 -0
  1431. vllm/transformers_utils/configs/bagel.py +53 -0
  1432. vllm/transformers_utils/configs/chatglm.py +75 -0
  1433. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1434. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1435. vllm/transformers_utils/configs/eagle.py +90 -0
  1436. vllm/transformers_utils/configs/falcon.py +89 -0
  1437. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1438. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1439. vllm/transformers_utils/configs/isaac.py +100 -0
  1440. vllm/transformers_utils/configs/jais.py +243 -0
  1441. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1442. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1443. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1444. vllm/transformers_utils/configs/medusa.py +65 -0
  1445. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1446. vllm/transformers_utils/configs/mistral.py +263 -0
  1447. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1448. vllm/transformers_utils/configs/moonvit.py +33 -0
  1449. vllm/transformers_utils/configs/nemotron.py +220 -0
  1450. vllm/transformers_utils/configs/nemotron_h.py +284 -0
  1451. vllm/transformers_utils/configs/olmo3.py +83 -0
  1452. vllm/transformers_utils/configs/ovis.py +182 -0
  1453. vllm/transformers_utils/configs/qwen3_next.py +277 -0
  1454. vllm/transformers_utils/configs/radio.py +98 -0
  1455. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1456. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1457. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1458. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1459. vllm/transformers_utils/configs/tarsier2.py +24 -0
  1460. vllm/transformers_utils/configs/ultravox.py +120 -0
  1461. vllm/transformers_utils/dynamic_module.py +70 -0
  1462. vllm/transformers_utils/gguf_utils.py +280 -0
  1463. vllm/transformers_utils/model_arch_config_convertor.py +402 -0
  1464. vllm/transformers_utils/processor.py +424 -0
  1465. vllm/transformers_utils/processors/__init__.py +25 -0
  1466. vllm/transformers_utils/processors/bagel.py +78 -0
  1467. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1468. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1469. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1470. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1471. vllm/transformers_utils/processors/ovis.py +453 -0
  1472. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1473. vllm/transformers_utils/repo_utils.py +287 -0
  1474. vllm/transformers_utils/runai_utils.py +102 -0
  1475. vllm/transformers_utils/s3_utils.py +95 -0
  1476. vllm/transformers_utils/tokenizer.py +19 -0
  1477. vllm/transformers_utils/utils.py +112 -0
  1478. vllm/triton_utils/__init__.py +20 -0
  1479. vllm/triton_utils/importing.py +103 -0
  1480. vllm/usage/__init__.py +0 -0
  1481. vllm/usage/usage_lib.py +278 -0
  1482. vllm/utils/__init__.py +36 -0
  1483. vllm/utils/argparse_utils.py +491 -0
  1484. vllm/utils/async_utils.py +310 -0
  1485. vllm/utils/cache.py +214 -0
  1486. vllm/utils/collection_utils.py +112 -0
  1487. vllm/utils/counter.py +45 -0
  1488. vllm/utils/deep_gemm.py +424 -0
  1489. vllm/utils/flashinfer.py +602 -0
  1490. vllm/utils/func_utils.py +236 -0
  1491. vllm/utils/gc_utils.py +151 -0
  1492. vllm/utils/hashing.py +117 -0
  1493. vllm/utils/import_utils.py +438 -0
  1494. vllm/utils/jsontree.py +158 -0
  1495. vllm/utils/math_utils.py +32 -0
  1496. vllm/utils/mem_constants.py +13 -0
  1497. vllm/utils/mem_utils.py +285 -0
  1498. vllm/utils/nccl.py +64 -0
  1499. vllm/utils/network_utils.py +331 -0
  1500. vllm/utils/nvtx_pytorch_hooks.py +286 -0
  1501. vllm/utils/platform_utils.py +59 -0
  1502. vllm/utils/profiling.py +56 -0
  1503. vllm/utils/registry.py +51 -0
  1504. vllm/utils/serial_utils.py +214 -0
  1505. vllm/utils/system_utils.py +296 -0
  1506. vllm/utils/tensor_schema.py +255 -0
  1507. vllm/utils/torch_utils.py +781 -0
  1508. vllm/v1/__init__.py +0 -0
  1509. vllm/v1/attention/__init__.py +0 -0
  1510. vllm/v1/attention/backend.py +736 -0
  1511. vllm/v1/attention/backends/__init__.py +0 -0
  1512. vllm/v1/attention/backends/cpu_attn.py +501 -0
  1513. vllm/v1/attention/backends/fa_utils.py +126 -0
  1514. vllm/v1/attention/backends/flash_attn.py +1092 -0
  1515. vllm/v1/attention/backends/flash_attn_diffkv.py +277 -0
  1516. vllm/v1/attention/backends/flashinfer.py +1713 -0
  1517. vllm/v1/attention/backends/flex_attention.py +1024 -0
  1518. vllm/v1/attention/backends/gdn_attn.py +382 -0
  1519. vllm/v1/attention/backends/linear_attn.py +77 -0
  1520. vllm/v1/attention/backends/mamba1_attn.py +28 -0
  1521. vllm/v1/attention/backends/mamba2_attn.py +256 -0
  1522. vllm/v1/attention/backends/mamba_attn.py +313 -0
  1523. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1524. vllm/v1/attention/backends/mla/aiter_triton_mla.py +66 -0
  1525. vllm/v1/attention/backends/mla/common.py +2156 -0
  1526. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1527. vllm/v1/attention/backends/mla/flashattn_mla.py +348 -0
  1528. vllm/v1/attention/backends/mla/flashinfer_mla.py +175 -0
  1529. vllm/v1/attention/backends/mla/flashmla.py +321 -0
  1530. vllm/v1/attention/backends/mla/flashmla_sparse.py +1021 -0
  1531. vllm/v1/attention/backends/mla/indexer.py +345 -0
  1532. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +284 -0
  1533. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +321 -0
  1534. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1535. vllm/v1/attention/backends/registry.py +258 -0
  1536. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1537. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1538. vllm/v1/attention/backends/rocm_attn.py +405 -0
  1539. vllm/v1/attention/backends/short_conv_attn.py +26 -0
  1540. vllm/v1/attention/backends/tree_attn.py +430 -0
  1541. vllm/v1/attention/backends/triton_attn.py +578 -0
  1542. vllm/v1/attention/backends/utils.py +978 -0
  1543. vllm/v1/attention/ops/__init__.py +0 -0
  1544. vllm/v1/attention/ops/chunked_prefill_paged_decode.py +459 -0
  1545. vllm/v1/attention/ops/common.py +469 -0
  1546. vllm/v1/attention/ops/flashmla.py +254 -0
  1547. vllm/v1/attention/ops/merge_attn_states.py +47 -0
  1548. vllm/v1/attention/ops/paged_attn.py +51 -0
  1549. vllm/v1/attention/ops/pallas_kv_cache_update.py +130 -0
  1550. vllm/v1/attention/ops/prefix_prefill.py +862 -0
  1551. vllm/v1/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  1552. vllm/v1/attention/ops/triton_decode_attention.py +709 -0
  1553. vllm/v1/attention/ops/triton_merge_attn_states.py +116 -0
  1554. vllm/v1/attention/ops/triton_prefill_attention.py +272 -0
  1555. vllm/v1/attention/ops/triton_reshape_and_cache_flash.py +395 -0
  1556. vllm/v1/attention/ops/triton_unified_attention.py +1088 -0
  1557. vllm/v1/attention/ops/vit_attn_wrappers.py +185 -0
  1558. vllm/v1/attention/selector.py +145 -0
  1559. vllm/v1/core/__init__.py +0 -0
  1560. vllm/v1/core/block_pool.py +489 -0
  1561. vllm/v1/core/encoder_cache_manager.py +402 -0
  1562. vllm/v1/core/kv_cache_coordinator.py +560 -0
  1563. vllm/v1/core/kv_cache_manager.py +485 -0
  1564. vllm/v1/core/kv_cache_metrics.py +96 -0
  1565. vllm/v1/core/kv_cache_utils.py +1642 -0
  1566. vllm/v1/core/sched/__init__.py +0 -0
  1567. vllm/v1/core/sched/async_scheduler.py +66 -0
  1568. vllm/v1/core/sched/interface.py +205 -0
  1569. vllm/v1/core/sched/output.py +261 -0
  1570. vllm/v1/core/sched/request_queue.py +208 -0
  1571. vllm/v1/core/sched/scheduler.py +1936 -0
  1572. vllm/v1/core/sched/utils.py +64 -0
  1573. vllm/v1/core/single_type_kv_cache_manager.py +926 -0
  1574. vllm/v1/cudagraph_dispatcher.py +183 -0
  1575. vllm/v1/engine/__init__.py +224 -0
  1576. vllm/v1/engine/async_llm.py +874 -0
  1577. vllm/v1/engine/coordinator.py +396 -0
  1578. vllm/v1/engine/core.py +1614 -0
  1579. vllm/v1/engine/core_client.py +1422 -0
  1580. vllm/v1/engine/detokenizer.py +351 -0
  1581. vllm/v1/engine/exceptions.py +18 -0
  1582. vllm/v1/engine/input_processor.py +713 -0
  1583. vllm/v1/engine/llm_engine.py +415 -0
  1584. vllm/v1/engine/logprobs.py +245 -0
  1585. vllm/v1/engine/output_processor.py +715 -0
  1586. vllm/v1/engine/parallel_sampling.py +150 -0
  1587. vllm/v1/engine/utils.py +1086 -0
  1588. vllm/v1/executor/__init__.py +6 -0
  1589. vllm/v1/executor/abstract.py +352 -0
  1590. vllm/v1/executor/multiproc_executor.py +888 -0
  1591. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1592. vllm/v1/executor/ray_executor.py +623 -0
  1593. vllm/v1/executor/ray_utils.py +468 -0
  1594. vllm/v1/executor/uniproc_executor.py +186 -0
  1595. vllm/v1/kv_cache_interface.py +485 -0
  1596. vllm/v1/kv_offload/__init__.py +0 -0
  1597. vllm/v1/kv_offload/abstract.py +161 -0
  1598. vllm/v1/kv_offload/arc_manager.py +237 -0
  1599. vllm/v1/kv_offload/backend.py +97 -0
  1600. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1601. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1602. vllm/v1/kv_offload/cpu.py +109 -0
  1603. vllm/v1/kv_offload/factory.py +58 -0
  1604. vllm/v1/kv_offload/lru_manager.py +139 -0
  1605. vllm/v1/kv_offload/mediums.py +39 -0
  1606. vllm/v1/kv_offload/spec.py +70 -0
  1607. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1608. vllm/v1/kv_offload/worker/cpu_gpu.py +287 -0
  1609. vllm/v1/kv_offload/worker/worker.py +163 -0
  1610. vllm/v1/metrics/__init__.py +0 -0
  1611. vllm/v1/metrics/loggers.py +1320 -0
  1612. vllm/v1/metrics/perf.py +1244 -0
  1613. vllm/v1/metrics/prometheus.py +82 -0
  1614. vllm/v1/metrics/ray_wrappers.py +194 -0
  1615. vllm/v1/metrics/reader.py +257 -0
  1616. vllm/v1/metrics/stats.py +440 -0
  1617. vllm/v1/outputs.py +242 -0
  1618. vllm/v1/pool/__init__.py +0 -0
  1619. vllm/v1/pool/metadata.py +124 -0
  1620. vllm/v1/request.py +281 -0
  1621. vllm/v1/sample/__init__.py +0 -0
  1622. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1623. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1624. vllm/v1/sample/logits_processor/interface.py +106 -0
  1625. vllm/v1/sample/logits_processor/state.py +165 -0
  1626. vllm/v1/sample/metadata.py +44 -0
  1627. vllm/v1/sample/ops/__init__.py +0 -0
  1628. vllm/v1/sample/ops/bad_words.py +57 -0
  1629. vllm/v1/sample/ops/logprobs.py +25 -0
  1630. vllm/v1/sample/ops/penalties.py +57 -0
  1631. vllm/v1/sample/ops/topk_topp_sampler.py +388 -0
  1632. vllm/v1/sample/rejection_sampler.py +822 -0
  1633. vllm/v1/sample/sampler.py +319 -0
  1634. vllm/v1/sample/tpu/__init__.py +0 -0
  1635. vllm/v1/sample/tpu/metadata.py +120 -0
  1636. vllm/v1/sample/tpu/sampler.py +215 -0
  1637. vllm/v1/serial_utils.py +514 -0
  1638. vllm/v1/spec_decode/__init__.py +0 -0
  1639. vllm/v1/spec_decode/eagle.py +1346 -0
  1640. vllm/v1/spec_decode/medusa.py +73 -0
  1641. vllm/v1/spec_decode/metadata.py +66 -0
  1642. vllm/v1/spec_decode/metrics.py +225 -0
  1643. vllm/v1/spec_decode/ngram_proposer.py +281 -0
  1644. vllm/v1/spec_decode/suffix_decoding.py +95 -0
  1645. vllm/v1/spec_decode/utils.py +109 -0
  1646. vllm/v1/structured_output/__init__.py +337 -0
  1647. vllm/v1/structured_output/backend_guidance.py +291 -0
  1648. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1649. vllm/v1/structured_output/backend_outlines.py +324 -0
  1650. vllm/v1/structured_output/backend_types.py +136 -0
  1651. vllm/v1/structured_output/backend_xgrammar.py +378 -0
  1652. vllm/v1/structured_output/request.py +91 -0
  1653. vllm/v1/structured_output/utils.py +457 -0
  1654. vllm/v1/utils.py +466 -0
  1655. vllm/v1/worker/__init__.py +0 -0
  1656. vllm/v1/worker/block_table.py +343 -0
  1657. vllm/v1/worker/cp_utils.py +42 -0
  1658. vllm/v1/worker/cpu_model_runner.py +122 -0
  1659. vllm/v1/worker/cpu_worker.py +192 -0
  1660. vllm/v1/worker/dp_utils.py +240 -0
  1661. vllm/v1/worker/ec_connector_model_runner_mixin.py +85 -0
  1662. vllm/v1/worker/gpu/README.md +4 -0
  1663. vllm/v1/worker/gpu/__init__.py +0 -0
  1664. vllm/v1/worker/gpu/async_utils.py +98 -0
  1665. vllm/v1/worker/gpu/attn_utils.py +183 -0
  1666. vllm/v1/worker/gpu/block_table.py +222 -0
  1667. vllm/v1/worker/gpu/buffer_utils.py +224 -0
  1668. vllm/v1/worker/gpu/cudagraph_utils.py +264 -0
  1669. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1670. vllm/v1/worker/gpu/input_batch.py +526 -0
  1671. vllm/v1/worker/gpu/metrics/__init__.py +0 -0
  1672. vllm/v1/worker/gpu/metrics/logits.py +42 -0
  1673. vllm/v1/worker/gpu/mm/__init__.py +0 -0
  1674. vllm/v1/worker/gpu/mm/mrope_utils.py +127 -0
  1675. vllm/v1/worker/gpu/model_runner.py +1005 -0
  1676. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1677. vllm/v1/worker/gpu/sample/gumbel.py +106 -0
  1678. vllm/v1/worker/gpu/sample/logit_bias.py +270 -0
  1679. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1680. vllm/v1/worker/gpu/sample/metadata.py +79 -0
  1681. vllm/v1/worker/gpu/sample/min_p.py +58 -0
  1682. vllm/v1/worker/gpu/sample/output.py +14 -0
  1683. vllm/v1/worker/gpu/sample/penalties.py +155 -0
  1684. vllm/v1/worker/gpu/sample/sampler.py +88 -0
  1685. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1686. vllm/v1/worker/gpu/spec_decode/eagle.py +566 -0
  1687. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1688. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +71 -0
  1689. vllm/v1/worker/gpu/states.py +282 -0
  1690. vllm/v1/worker/gpu/structured_outputs.py +100 -0
  1691. vllm/v1/worker/gpu_input_batch.py +1030 -0
  1692. vllm/v1/worker/gpu_model_runner.py +5761 -0
  1693. vllm/v1/worker/gpu_ubatch_wrapper.py +475 -0
  1694. vllm/v1/worker/gpu_worker.py +968 -0
  1695. vllm/v1/worker/kv_connector_model_runner_mixin.py +300 -0
  1696. vllm/v1/worker/lora_model_runner_mixin.py +225 -0
  1697. vllm/v1/worker/tpu_input_batch.py +574 -0
  1698. vllm/v1/worker/tpu_worker.py +18 -0
  1699. vllm/v1/worker/ubatch_utils.py +112 -0
  1700. vllm/v1/worker/ubatching.py +242 -0
  1701. vllm/v1/worker/utils.py +400 -0
  1702. vllm/v1/worker/worker_base.py +372 -0
  1703. vllm/v1/worker/workspace.py +253 -0
  1704. vllm/v1/worker/xpu_model_runner.py +48 -0
  1705. vllm/v1/worker/xpu_worker.py +174 -0
  1706. vllm/version.py +39 -0
  1707. vllm/vllm_flash_attn/.gitkeep +0 -0
  1708. vllm_cpu_avx512bf16-0.14.0.dist-info/METADATA +348 -0
  1709. vllm_cpu_avx512bf16-0.14.0.dist-info/RECORD +1712 -0
  1710. vllm_cpu_avx512bf16-0.14.0.dist-info/WHEEL +5 -0
  1711. vllm_cpu_avx512bf16-0.14.0.dist-info/entry_points.txt +5 -0
  1712. vllm_cpu_avx512bf16-0.14.0.dist-info/top_level.txt +1 -0
vllm/_custom_ops.py ADDED
@@ -0,0 +1,3206 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ from typing import TYPE_CHECKING, Literal
5
+
6
+ import torch
7
+
8
+ import vllm.envs as envs
9
+ from vllm.logger import init_logger
10
+ from vllm.platforms import current_platform
11
+ from vllm.scalar_type import ScalarType
12
+
13
+ logger = init_logger(__name__)
14
+
15
+ current_platform.import_kernels()
16
+
17
+ if TYPE_CHECKING:
18
+
19
+ def register_fake(fn):
20
+ return lambda name: fn
21
+ else:
22
+ try:
23
+ from torch.library import register_fake
24
+ except ImportError:
25
+ from torch.library import impl_abstract as register_fake
26
+
27
+
28
+ # page attention ops
29
+ def paged_attention_v1(
30
+ out: torch.Tensor,
31
+ query: torch.Tensor,
32
+ key_cache: torch.Tensor,
33
+ value_cache: torch.Tensor,
34
+ num_kv_heads: int,
35
+ scale: float,
36
+ block_tables: torch.Tensor,
37
+ seq_lens: torch.Tensor,
38
+ block_size: int,
39
+ max_seq_len: int,
40
+ alibi_slopes: torch.Tensor | None,
41
+ kv_cache_dtype: str,
42
+ k_scale: torch.Tensor,
43
+ v_scale: torch.Tensor,
44
+ tp_rank: int = 0,
45
+ blocksparse_local_blocks: int = 0,
46
+ blocksparse_vert_stride: int = 0,
47
+ blocksparse_block_size: int = 64,
48
+ blocksparse_head_sliding_step: int = 0,
49
+ ) -> None:
50
+ torch.ops._C.paged_attention_v1(
51
+ out,
52
+ query,
53
+ key_cache,
54
+ value_cache,
55
+ num_kv_heads,
56
+ scale,
57
+ block_tables,
58
+ seq_lens,
59
+ block_size,
60
+ max_seq_len,
61
+ alibi_slopes,
62
+ kv_cache_dtype,
63
+ k_scale,
64
+ v_scale,
65
+ tp_rank,
66
+ blocksparse_local_blocks,
67
+ blocksparse_vert_stride,
68
+ blocksparse_block_size,
69
+ blocksparse_head_sliding_step,
70
+ )
71
+
72
+
73
+ def paged_attention_v2(
74
+ out: torch.Tensor,
75
+ exp_sum: torch.Tensor,
76
+ max_logits: torch.Tensor,
77
+ tmp_out: torch.Tensor,
78
+ query: torch.Tensor,
79
+ key_cache: torch.Tensor,
80
+ value_cache: torch.Tensor,
81
+ num_kv_heads: int,
82
+ scale: float,
83
+ block_tables: torch.Tensor,
84
+ seq_lens: torch.Tensor,
85
+ block_size: int,
86
+ max_seq_len: int,
87
+ alibi_slopes: torch.Tensor | None,
88
+ kv_cache_dtype: str,
89
+ k_scale: torch.Tensor,
90
+ v_scale: torch.Tensor,
91
+ tp_rank: int = 0,
92
+ blocksparse_local_blocks: int = 0,
93
+ blocksparse_vert_stride: int = 0,
94
+ blocksparse_block_size: int = 64,
95
+ blocksparse_head_sliding_step: int = 0,
96
+ ) -> None:
97
+ torch.ops._C.paged_attention_v2(
98
+ out,
99
+ exp_sum,
100
+ max_logits,
101
+ tmp_out,
102
+ query,
103
+ key_cache,
104
+ value_cache,
105
+ num_kv_heads,
106
+ scale,
107
+ block_tables,
108
+ seq_lens,
109
+ block_size,
110
+ max_seq_len,
111
+ alibi_slopes,
112
+ kv_cache_dtype,
113
+ k_scale,
114
+ v_scale,
115
+ tp_rank,
116
+ blocksparse_local_blocks,
117
+ blocksparse_vert_stride,
118
+ blocksparse_block_size,
119
+ blocksparse_head_sliding_step,
120
+ )
121
+
122
+
123
+ def paged_attention_rocm(
124
+ out: torch.Tensor,
125
+ exp_sum: torch.Tensor,
126
+ max_logits: torch.Tensor,
127
+ tmp_out: torch.Tensor,
128
+ query: torch.Tensor,
129
+ key_cache: torch.Tensor,
130
+ value_cache: torch.Tensor,
131
+ num_kv_heads: int,
132
+ scale: float,
133
+ block_tables: torch.Tensor,
134
+ seq_lens: torch.Tensor,
135
+ query_start_loc: torch.Tensor | None,
136
+ block_size: int,
137
+ max_seq_len: int,
138
+ alibi_slopes: torch.Tensor | None,
139
+ kv_cache_dtype: str,
140
+ k_scale: torch.Tensor,
141
+ v_scale: torch.Tensor,
142
+ fp8_out_scale: torch.Tensor | None = None,
143
+ mfma_type: str = "fp8" if envs.VLLM_ROCM_FP8_MFMA_PAGE_ATTN else "f16",
144
+ ) -> None:
145
+ torch.ops._rocm_C.paged_attention(
146
+ out,
147
+ exp_sum,
148
+ max_logits,
149
+ tmp_out,
150
+ query,
151
+ key_cache,
152
+ value_cache,
153
+ num_kv_heads,
154
+ scale,
155
+ block_tables,
156
+ seq_lens,
157
+ query_start_loc,
158
+ block_size,
159
+ max_seq_len,
160
+ alibi_slopes,
161
+ kv_cache_dtype,
162
+ k_scale,
163
+ v_scale,
164
+ fp8_out_scale,
165
+ mfma_type,
166
+ )
167
+
168
+
169
+ def mla_decode_kvcache_cpu(
170
+ out: torch.Tensor,
171
+ query: torch.Tensor,
172
+ kv_cache: torch.Tensor,
173
+ scale: float,
174
+ block_tables: torch.Tensor,
175
+ seq_lens: torch.Tensor,
176
+ ) -> None:
177
+ torch.ops._C_cpu.mla_decode_kvcache(
178
+ out, query, kv_cache, scale, block_tables, seq_lens
179
+ )
180
+
181
+
182
+ # merge attn states ops
183
+ def merge_attn_states(
184
+ output: torch.Tensor,
185
+ prefix_output: torch.Tensor,
186
+ prefix_lse: torch.Tensor,
187
+ suffix_output: torch.Tensor,
188
+ suffix_lse: torch.Tensor,
189
+ output_lse: torch.Tensor | None = None,
190
+ ) -> None:
191
+ torch.ops._C.merge_attn_states(
192
+ output, output_lse, prefix_output, prefix_lse, suffix_output, suffix_lse
193
+ )
194
+
195
+
196
+ def convert_vertical_slash_indexes(
197
+ q_seqlens: torch.Tensor, # [BATCH, ]
198
+ kv_seqlens: torch.Tensor, # [BATCH, ]
199
+ vertical_indexes: torch.Tensor, # [BATCH, N_HEADS, NNZ_V]
200
+ slash_indexes: torch.Tensor, # [BATCH, N_HEADS, NNZ_S]
201
+ context_size: int,
202
+ block_size_M: int,
203
+ block_size_N: int,
204
+ causal: bool = True,
205
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
206
+ batch_size = slash_indexes.size(0)
207
+ num_heads = slash_indexes.size(1)
208
+ nnz_slash = slash_indexes.size(2)
209
+ nnz_vertical = vertical_indexes.size(2)
210
+ num_rows = (context_size + block_size_M - 1) // block_size_M
211
+
212
+ block_count = torch.zeros(
213
+ batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
214
+ )
215
+ block_offset = torch.zeros(
216
+ batch_size,
217
+ num_heads,
218
+ num_rows,
219
+ nnz_slash,
220
+ dtype=q_seqlens.dtype,
221
+ device=q_seqlens.device,
222
+ )
223
+ column_count = torch.zeros(
224
+ batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
225
+ )
226
+ column_index = torch.zeros(
227
+ batch_size,
228
+ num_heads,
229
+ num_rows,
230
+ nnz_vertical,
231
+ dtype=q_seqlens.dtype,
232
+ device=q_seqlens.device,
233
+ )
234
+
235
+ torch.ops._C.convert_vertical_slash_indexes(
236
+ block_count,
237
+ block_offset,
238
+ column_count,
239
+ column_index,
240
+ q_seqlens,
241
+ kv_seqlens,
242
+ vertical_indexes,
243
+ slash_indexes,
244
+ context_size,
245
+ block_size_M,
246
+ block_size_N,
247
+ causal,
248
+ )
249
+ return block_count, block_offset, column_count, column_index
250
+
251
+
252
+ def convert_vertical_slash_indexes_mergehead(
253
+ q_seqlens: torch.Tensor, # [BATCH, ]
254
+ kv_seqlens: torch.Tensor, # [BATCH, ]
255
+ vertical_indexes: torch.Tensor, # [BATCH, N_HEADS, NNZ_V]
256
+ slash_indexes: torch.Tensor, # [BATCH, N_HEADS, NNZ_S]
257
+     # [N_HEADS]: different heads use different numbers of indices
258
+ vertical_indices_count: torch.Tensor,
259
+ slash_indices_count: torch.Tensor,
260
+ context_size: int,
261
+ block_size_M: int,
262
+ block_size_N: int,
263
+ causal: bool = True,
264
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
265
+ batch_size = slash_indexes.size(0)
266
+ num_heads = slash_indexes.size(1)
267
+ nnz_slash = slash_indexes.size(2)
268
+ nnz_vertical = vertical_indexes.size(2)
269
+ num_rows = (context_size + block_size_M - 1) // block_size_M
270
+
271
+ block_count = torch.empty(
272
+ batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
273
+ )
274
+ block_offset = torch.empty(
275
+ batch_size,
276
+ num_heads,
277
+ num_rows,
278
+ nnz_slash,
279
+ dtype=q_seqlens.dtype,
280
+ device=q_seqlens.device,
281
+ )
282
+ column_count = torch.empty(
283
+ batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
284
+ )
285
+ column_index = torch.empty(
286
+ batch_size,
287
+ num_heads,
288
+ num_rows,
289
+ nnz_vertical,
290
+ dtype=q_seqlens.dtype,
291
+ device=q_seqlens.device,
292
+ )
293
+
294
+ torch.ops._C.convert_vertical_slash_indexes_mergehead(
295
+ block_count,
296
+ block_offset,
297
+ column_count,
298
+ column_index,
299
+ q_seqlens,
300
+ kv_seqlens,
301
+ vertical_indexes,
302
+ slash_indexes,
303
+ vertical_indices_count,
304
+ slash_indices_count,
305
+ context_size,
306
+ block_size_M,
307
+ block_size_N,
308
+ causal,
309
+ )
310
+ return block_count, block_offset, column_count, column_index
311
+
312
+
313
+ # pos encoding ops
314
+ def rotary_embedding(
315
+ positions: torch.Tensor,
316
+ query: torch.Tensor,
317
+ key: torch.Tensor | None,
318
+ head_size: int,
319
+ cos_sin_cache: torch.Tensor,
320
+ is_neox: bool,
321
+ ) -> None:
322
+ torch.ops._C.rotary_embedding(
323
+ positions, query, key, head_size, cos_sin_cache, is_neox
324
+ )
325
+
326
+
327
+ # layer norm ops
328
+ def rms_norm(
329
+ out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float
330
+ ) -> None:
331
+ torch.ops._C.rms_norm(out, input, weight, epsilon)
332
+
333
+
334
+ def fused_add_rms_norm(
335
+ input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, epsilon: float
336
+ ) -> None:
337
+ torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
338
+
339
+
340
+ def fused_qk_norm_rope(
341
+ qkv: torch.Tensor,
342
+ num_heads_q: int,
343
+ num_heads_k: int,
344
+ num_heads_v: int,
345
+ head_dim: int,
346
+ eps: float,
347
+ q_weight: torch.Tensor,
348
+ k_weight: torch.Tensor,
349
+ cos_sin_cache: torch.Tensor,
350
+ is_neox: bool,
351
+ position_ids: torch.Tensor,
352
+ ) -> None:
353
+ torch.ops._C.fused_qk_norm_rope(
354
+ qkv,
355
+ num_heads_q,
356
+ num_heads_k,
357
+ num_heads_v,
358
+ head_dim,
359
+ eps,
360
+ q_weight,
361
+ k_weight,
362
+ cos_sin_cache,
363
+ is_neox,
364
+ position_ids,
365
+ )
366
+
367
+
368
+ def apply_repetition_penalties_torch(
369
+ logits: torch.Tensor,
370
+ prompt_mask: torch.Tensor,
371
+ output_mask: torch.Tensor,
372
+ repetition_penalties: torch.Tensor,
373
+ ) -> None:
374
+ repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(
375
+ 1, logits.size(1)
376
+ )
377
+ # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
378
+ penalties = torch.where(prompt_mask | output_mask, repetition_penalties, 1.0)
379
+ # If logits are positive, divide by penalty, otherwise multiply by penalty.
380
+ scaling = torch.where(logits > 0, 1.0 / penalties, penalties)
381
+ logits *= scaling
382
+
383
+
384
+ def apply_repetition_penalties_cuda(
385
+ logits: torch.Tensor,
386
+ prompt_mask: torch.Tensor,
387
+ output_mask: torch.Tensor,
388
+ repetition_penalties: torch.Tensor,
389
+ ) -> None:
390
+ torch.ops._C.apply_repetition_penalties_(
391
+ logits, prompt_mask, output_mask, repetition_penalties
392
+ )
393
+
394
+
395
+ def apply_repetition_penalties(
396
+ logits: torch.Tensor,
397
+ prompt_mask: torch.Tensor,
398
+ output_mask: torch.Tensor,
399
+ repetition_penalties: torch.Tensor,
400
+ ) -> None:
401
+ """Apply repetition penalties to logits in-place.
402
+
403
+ Args:
404
+ logits: The logits tensor of shape [num_seqs, vocab_size].
405
+ prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
406
+ output_mask: A boolean tensor indicating which tokens appear in the output.
407
+ repetition_penalties: The repetition penalties of shape (num_seqs, ).
408
+ """
409
+ if logits.is_cuda and logits.is_contiguous():
410
+ apply_repetition_penalties_cuda(
411
+ logits, prompt_mask, output_mask, repetition_penalties
412
+ )
413
+ else:
414
+ apply_repetition_penalties_torch(
415
+ logits, prompt_mask, output_mask, repetition_penalties
416
+ )
417
+
418
+
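# A tiny worked example of the repetition-penalty rule above, in plain torch
# (values are illustrative; this runs without the compiled _C extension):
#
#   import torch
#   logits = torch.tensor([[2.0, -1.0, 0.5]])            # [num_seqs=1, vocab=3]
#   prompt_mask = torch.tensor([[True, False, False]])   # token 0 appeared in prompt
#   output_mask = torch.tensor([[False, True, False]])   # token 1 appeared in output
#   penalties = torch.tensor([2.0])                       # per-sequence penalty
#   apply_repetition_penalties_torch(logits, prompt_mask, output_mask, penalties)
#   # token 0: positive logit, divided by 2.0    ->  1.0
#   # token 1: negative logit, multiplied by 2.0 -> -2.0
#   # token 2: never seen, left unchanged        ->  0.5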
419
+ # fused quant layer norm ops
420
+ def rms_norm_dynamic_per_token_quant(
421
+ input: torch.Tensor,
422
+ weight: torch.Tensor,
423
+ epsilon: float,
424
+ quant_dtype: torch.dtype,
425
+ scale_ub: torch.Tensor | None = None,
426
+ residual: torch.Tensor | None = None,
427
+ ) -> tuple[torch.Tensor, torch.Tensor]:
428
+ output = torch.empty_like(input, dtype=quant_dtype)
429
+ scales = torch.empty(
430
+ (input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32
431
+ )
432
+
433
+ torch.ops._C.rms_norm_dynamic_per_token_quant(
434
+ output, input, weight, scales, epsilon, scale_ub, residual
435
+ )
436
+ return output, scales
437
+
438
+
439
+ # fused quant layer norm ops blocked
440
+ def rms_norm_per_block_quant(
441
+ input: torch.Tensor,
442
+ weight: torch.Tensor,
443
+ epsilon: float,
444
+ quant_dtype: torch.dtype,
445
+ group_size: list[int],
446
+ scale_ub: torch.Tensor | None = None,
447
+ residual: torch.Tensor | None = None,
448
+ is_scale_transposed: bool = False,
449
+ ) -> tuple[torch.Tensor, torch.Tensor]:
450
+ assert len(group_size) == 2
451
+ output = torch.empty_like(input, dtype=quant_dtype)
452
+ if is_scale_transposed:
453
+ scales = torch.empty(
454
+ (input.shape[-1] // group_size[1], input.numel() // input.shape[-1]),
455
+ device=input.device,
456
+ dtype=torch.float32,
457
+ ).transpose(0, 1)
458
+ else:
459
+ scales = torch.empty(
460
+ (input.numel() // input.shape[-1], input.shape[-1] // group_size[1]),
461
+ device=input.device,
462
+ dtype=torch.float32,
463
+ )
464
+
465
+ torch.ops._C.rms_norm_per_block_quant(
466
+ output,
467
+ input,
468
+ weight,
469
+ scales,
470
+ epsilon,
471
+ scale_ub,
472
+ residual,
473
+ group_size[1],
474
+ is_scale_transposed,
475
+ )
476
+ return output, scales
477
+
478
+
479
+ # quantization ops
480
+ # awq
481
+ def awq_dequantize(
482
+ qweight: torch.Tensor,
483
+ scales: torch.Tensor,
484
+ zeros: torch.Tensor,
485
+ split_k_iters: int,
486
+ thx: int,
487
+ thy: int,
488
+ ) -> torch.Tensor:
489
+ if envs.VLLM_USE_TRITON_AWQ:
490
+ from vllm.model_executor.layers.quantization.awq_triton import (
491
+ awq_dequantize_triton,
492
+ )
493
+
494
+ return awq_dequantize_triton(qweight, scales, zeros)
495
+ return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters, thx, thy)
496
+
497
+
498
+ def awq_gemm(
499
+ input: torch.Tensor,
500
+ qweight: torch.Tensor,
501
+ scales: torch.Tensor,
502
+ qzeros: torch.Tensor,
503
+ split_k_iters: int,
504
+ ) -> torch.Tensor:
505
+ if envs.VLLM_USE_TRITON_AWQ:
506
+ from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
507
+
508
+ return awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters)
509
+ return torch.ops._C.awq_gemm(input, qweight, scales, qzeros, split_k_iters)
510
+
511
+
512
+ # gptq
513
+ def gptq_gemm(
514
+ a: torch.Tensor,
515
+ b_q_weight: torch.Tensor,
516
+ b_gptq_qzeros: torch.Tensor,
517
+ b_gptq_scales: torch.Tensor,
518
+ b_g_idx: torch.Tensor,
519
+ use_exllama: bool,
520
+ use_v2_format: bool,
521
+ bit: int,
522
+ ) -> torch.Tensor:
523
+ return torch.ops._C.gptq_gemm(
524
+ a,
525
+ b_q_weight,
526
+ b_gptq_qzeros,
527
+ b_gptq_scales,
528
+ b_g_idx,
529
+ use_exllama,
530
+ use_v2_format,
531
+ bit,
532
+ )
533
+
534
+
535
+ if hasattr(torch.ops._C, "gptq_gemm"):
536
+
537
+ @register_fake("_C::gptq_gemm")
538
+ def _gptq_gemm_fake(
539
+ a: torch.Tensor,
540
+ b_q_weight: torch.Tensor,
541
+ b_gptq_qzeros: torch.Tensor,
542
+ b_gptq_scales: torch.Tensor,
543
+ b_g_idx: torch.Tensor,
544
+ use_exllama: bool,
545
+ use_v2_format: bool,
546
+ bit: int,
547
+ ) -> torch.Tensor:
548
+ return torch.empty(
549
+ (a.size(0), b_q_weight.size(1)), dtype=a.dtype, device=a.device
550
+ )
551
+
552
+
553
+ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, bit: int) -> None:
554
+ torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
555
+
556
+
557
+ # marlin_24
558
+ def gptq_marlin_24_gemm(
559
+ a: torch.Tensor,
560
+ b_q_weight: torch.Tensor,
561
+ b_meta: torch.Tensor,
562
+ b_scales: torch.Tensor,
563
+ workspace: torch.Tensor,
564
+ b_q_type: ScalarType,
565
+ size_m: int,
566
+ size_n: int,
567
+ size_k: int,
568
+ ) -> torch.Tensor:
569
+ return torch.ops._C.gptq_marlin_24_gemm(
570
+ a, b_q_weight, b_meta, b_scales, workspace, b_q_type.id, size_m, size_n, size_k
571
+ )
572
+
573
+
574
+ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
575
+
576
+ @register_fake("_C::gptq_marlin_24_gemm")
577
+ def _gptq_marlin_24_gemm_fake(
578
+ a: torch.Tensor,
579
+ b_q_weight: torch.Tensor,
580
+ b_meta: torch.Tensor,
581
+ b_scales: torch.Tensor,
582
+ workspace: torch.Tensor,
583
+ b_q_type: ScalarType,
584
+ size_m: torch.SymInt,
585
+ size_n: torch.SymInt,
586
+ size_k: torch.SymInt,
587
+ ) -> torch.Tensor:
588
+ return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype)
589
+
590
+ @register_fake("_C::gptq_marlin_gemm")
591
+ def _gptq_marlin_gemm_fake(
592
+ a: torch.Tensor,
593
+ c: torch.Tensor | None,
594
+ b_q_weight: torch.Tensor,
595
+ b_bias: torch.Tensor | None,
596
+ b_scales: torch.Tensor,
597
+ a_scales: torch.Tensor | None,
598
+ global_scale: torch.Tensor | None,
599
+ b_zeros: torch.Tensor | None,
600
+ g_idx: torch.Tensor | None,
601
+ perm: torch.Tensor | None,
602
+ workspace: torch.Tensor,
603
+ b_q_type_id: int,
604
+ size_m: torch.SymInt,
605
+ size_n: torch.SymInt,
606
+ size_k: torch.SymInt,
607
+ is_k_full: bool = True,
608
+ use_atomic_add: bool = False,
609
+ use_fp32_reduce: bool = False,
610
+ is_zp_float: bool = False,
611
+ ) -> torch.Tensor:
612
+ dtype = a.dtype
613
+ if dtype not in [torch.half, torch.bfloat16]:
614
+ dtype = b_scales.dtype
615
+ return torch.empty((size_m, size_n), device=a.device, dtype=dtype)
616
+
617
+ @register_fake("_C::awq_dequantize")
618
+ def _awq_dequantize_fake(
619
+ qweight: torch.Tensor,
620
+ scales: torch.Tensor,
621
+ zeros: torch.Tensor,
622
+ split_k_iters: torch.SymInt,
623
+ thx: int,
624
+ thy: int,
625
+ ) -> torch.Tensor:
626
+ in_c = qweight.size(0)
627
+ qout_c = qweight.size(1)
628
+ out_c = qout_c * 8
629
+ return torch.empty((in_c, out_c), dtype=scales.dtype, device=scales.device)
630
+
631
+ @register_fake("_C::awq_gemm")
632
+ def _awq_gemm_fake(
633
+ input: torch.Tensor,
634
+ qweight: torch.Tensor,
635
+ scales: torch.Tensor,
636
+ qzeros: torch.Tensor,
637
+ split_k_iters: torch.SymInt,
638
+ ) -> torch.Tensor:
639
+ num_in_feats = input.size(0)
640
+ return torch.empty(
641
+ (split_k_iters, num_in_feats, qweight.size(1) * 8),
642
+ dtype=input.dtype,
643
+ device=input.device,
644
+ ).sum(0)
645
+
646
+ @register_fake("_C::machete_mm")
647
+ def machete_mm_fake(
648
+ a: torch.Tensor,
649
+         # b_q should be the tensor returned by machete_prepack_B
650
+ b_q: torch.Tensor,
651
+ b_type: ScalarType,
652
+ out_type: torch.dtype | None = None,
653
+ b_group_scales: torch.Tensor | None = None,
654
+ b_group_zeros: torch.Tensor | None = None,
655
+ b_group_size: int | None = None,
656
+ b_channel_scales: torch.Tensor | None = None,
657
+ a_token_scales: torch.Tensor | None = None,
658
+ schedule: str | None = None,
659
+ ) -> torch.Tensor:
660
+ m = a.size(0)
661
+ n = b_q.size(1)
662
+ return torch.empty((m, n), device=a.device, dtype=a.dtype)
663
+
664
+ @register_fake("_C::machete_prepack_B")
665
+ def machete_prepack_B_fake(
666
+ b_q_weight: torch.Tensor,
667
+ a_type: torch.dtype,
668
+ b_type: ScalarType,
669
+ group_scales_type: torch.dtype | None,
670
+ ) -> torch.Tensor:
671
+ return torch.empty_like(b_q_weight, memory_format=torch.contiguous_format)
672
+
673
+ @register_fake("_C::cutlass_w4a8_mm")
674
+ def cutlass_w4a8_mm_fake(
675
+ a: torch.Tensor,
676
+         # b_q should be the tensor returned by cutlass_encode_and_reorder_int4b
677
+ b_q: torch.Tensor,
678
+ b_group_scales: torch.Tensor,
679
+ b_group_size: int,
680
+ b_channel_scales: torch.Tensor,
681
+ a_token_scales: torch.Tensor,
682
+ out_type: torch.dtype | None = None,
683
+ maybe_schedule: str | None = None,
684
+ ) -> torch.Tensor:
685
+ m = a.size(0)
686
+ n = b_q.size(1)
687
+ out_dtype = out_type if out_type is not None else torch.bfloat16
688
+ return torch.empty((m, n), device=a.device, dtype=out_dtype)
689
+
690
+ @register_fake("_C::cutlass_pack_scale_fp8")
691
+ def cutlass_pack_scale_fp8_fake(scales: torch.Tensor) -> torch.Tensor:
692
+ return torch.empty_like(scales, memory_format=torch.contiguous_format)
693
+
694
+ @register_fake("_C::cutlass_encode_and_reorder_int4b")
695
+ def cutlass_encode_and_reorder_int4b_fake(b: torch.Tensor) -> torch.Tensor:
696
+ return torch.empty_like(b, memory_format=torch.contiguous_format)
697
+
698
+ @register_fake("_C::cutlass_encode_and_reorder_int4b_grouped")
699
+ def cutlass_encode_and_reorder_int4b_grouped_fake(b: torch.Tensor) -> torch.Tensor:
700
+ return torch.empty_like(b, memory_format=torch.contiguous_format)
701
+
702
+
703
+ if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
704
+
705
+ @register_fake("_C::allspark_w8a16_gemm")
706
+ def _allspark_w8a16_gemm_fake(
707
+ a: torch.Tensor,
708
+ b_qweight: torch.Tensor,
709
+ b_scales: torch.Tensor,
710
+ b_qzeros: torch.Tensor | None,
711
+ n: torch.SymInt,
712
+ group_size: torch.SymInt,
713
+ sm_count: torch.SymInt,
714
+ sm_version: torch.SymInt,
715
+ CUBLAS_M_THRESHOLD: torch.SymInt,
716
+ has_zp: bool,
717
+ n32k16_reorder: bool,
718
+ ) -> torch.Tensor:
719
+ m = a.size(0)
720
+ return torch.empty((m, n), device=a.device, dtype=a.dtype)
721
+
722
+
723
+ if hasattr(torch.ops._C, "ggml_dequantize"):
724
+
725
+ @register_fake("_C::ggml_dequantize")
726
+ def _ggml_dequantize_fake(
727
+ W: torch.Tensor,
728
+ quant_type: int,
729
+ m: torch.SymInt,
730
+ n: torch.SymInt,
731
+ dtype: torch.dtype | None = None,
732
+ ) -> torch.Tensor:
733
+ return torch.empty((m, n), dtype=torch.float16, device=W.device)
734
+
735
+ @register_fake("_C::ggml_mul_mat_vec_a8")
736
+ def _ggml_mul_mat_vec_a8_fake(
737
+ W: torch.Tensor,
738
+ X: torch.Tensor,
739
+ quant_type: int,
740
+ row: torch.SymInt,
741
+ ) -> torch.Tensor:
742
+ return torch.empty((X.shape[0], row), dtype=X.dtype, device=W.device)
743
+
744
+ @register_fake("_C::ggml_mul_mat_a8")
745
+ def _ggml_mul_mat_a8_fake(
746
+ W: torch.Tensor,
747
+ X: torch.Tensor,
748
+ quant_type: int,
749
+ row: torch.SymInt,
750
+ ) -> torch.Tensor:
751
+ batch = X.size(0)
752
+ return torch.empty((batch, row), dtype=X.dtype, device=W.device)
753
+
754
+ @register_fake("_C::ggml_moe_a8")
755
+ def _ggml_moe_a8_fake(
756
+ X: torch.Tensor,
757
+ W: torch.Tensor,
758
+ sorted_token_ids: torch.Tensor,
759
+ expert_ids: torch.Tensor,
760
+ num_tokens_post_padded: torch.Tensor,
761
+ quant_type: int,
762
+ row: torch.SymInt,
763
+ top_k: torch.SymInt,
764
+ tokens: torch.SymInt,
765
+ ) -> torch.Tensor:
766
+ tokens = X.size(0)
767
+ return torch.empty((tokens * top_k, row), dtype=torch.float16, device=W.device)
768
+
769
+
770
+ if hasattr(torch.ops._C, "ggml_moe_a8_vec"):
771
+
772
+ @register_fake("_C::ggml_moe_a8_vec")
773
+ def _ggml_moe_a8_vec_fake(
774
+ X: torch.Tensor,
775
+ W: torch.Tensor,
776
+ topk_ids: torch.Tensor,
777
+ top_k: int,
778
+ quant_type: int,
779
+ row: torch.SymInt,
780
+ tokens: torch.SymInt,
781
+ ) -> torch.Tensor:
782
+ tokens = X.size(0)
783
+ return torch.empty((tokens * top_k, row), dtype=X.dtype, device=W.device)
784
+
785
+
786
+ # cutlass
787
+ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
788
+ return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
789
+
790
+
791
+ def cutlass_scaled_fp4_mm(
792
+ a: torch.Tensor,
793
+ b: torch.Tensor,
794
+ block_scale_a: torch.Tensor,
795
+ block_scale_b: torch.Tensor,
796
+ alpha: torch.Tensor,
797
+ out_dtype: torch.dtype,
798
+ ) -> torch.Tensor:
799
+ assert a.ndim == 2 and b.ndim == 2
800
+ m, n = a.shape[0], b.shape[0]
801
+ out = torch.empty((m, n), dtype=out_dtype, device=a.device)
802
+ torch.ops._C.cutlass_scaled_fp4_mm(out, a, b, block_scale_a, block_scale_b, alpha)
803
+ return out
804
+
805
+
806
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
807
+ return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
808
+
809
+
810
+ def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool:
811
+ return torch.ops._C.cutlass_scaled_mm_supports_block_fp8(cuda_device_capability)
812
+
813
+
814
+ def cutlass_scaled_mm(
815
+ a: torch.Tensor,
816
+ b: torch.Tensor,
817
+ scale_a: torch.Tensor,
818
+ scale_b: torch.Tensor,
819
+ out_dtype: torch.dtype,
820
+ bias: torch.Tensor | None = None,
821
+ ) -> torch.Tensor:
822
+ """
823
+ `cutlass_scaled_mm` implements a fused version of
824
+ `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
825
+ where scale_a * a and scale_b * b are implemented using numpy-style
826
+ broadcasting.
827
+
828
+     In order to support blockwise scaling, like that found in DeepSeek V3, we
829
+     also support extended "group" broadcast rules. We extend the numpy-style
830
+     broadcasting rules with the following rule:
831
+     "if the extent of a dimension in the source shape is between 1 and the
832
+     corresponding extent in the target shape, we repeat each element along
833
+     that dimension target_shape[dim] // src_shape[dim] times consecutively"
834
+ example if we have:
835
+ a = [[1, 2], and target_shape = (2, 4)
836
+ [3, 4]]
837
+ then we would expand a to:
838
+ a = [[1, 1, 2, 2],
839
+ [3, 3, 4, 4]]
840
+ currently we only support the case:
841
+ scale_a.shape * [1, 128] == a.shape
842
+ scale_b.shape * [128, 128] == b.shape
843
+ """
844
+ assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
845
+ assert bias is None or bias.numel() == b.shape[1] and bias.dtype == out_dtype
846
+
847
+ # Massage the input to be 2D
848
+ target_shape = (*a.shape[:-1], b.shape[1])
849
+ a = a.view(-1, a.shape[-1])
850
+
851
+ cutlass_compatible_b = b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
852
+ if current_platform.is_rocm() or not cutlass_compatible_b:
853
+ from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import ( # noqa
854
+ triton_scaled_mm,
855
+ )
856
+
857
+ out = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
858
+ else:
859
+ out = torch.empty((a.shape[0], b.shape[1]), dtype=out_dtype, device=a.device)
860
+ torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
861
+
862
+ return out.view(*target_shape)
863
+
864
+
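# A minimal shape sketch for the blockwise ("group") scaling case documented
# above; the concrete shapes and dtypes below are illustrative assumptions only:
#
#   a       : (4, 256)   torch.float8_e4m3fn   quantized activations
#   scale_a : (4, 2)     torch.float32         scale_a.shape * [1, 128]   == a.shape
#   b       : (256, 512) torch.float8_e4m3fn   quantized weights
#   scale_b : (2, 4)     torch.float32         scale_b.shape * [128, 128] == b.shape
#   out = cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)
#   # out.shape == (4, 512); each 1x128 block of a and 128x128 block of b is
#   # multiplied by its own scale before the matmul, per the broadcast rule.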
865
+ def cutlass_scaled_mm_azp(
866
+ a: torch.Tensor,
867
+ b: torch.Tensor,
868
+ scale_a: torch.Tensor,
869
+ scale_b: torch.Tensor,
870
+ out_dtype: torch.dtype,
871
+ azp_adj: torch.Tensor,
872
+ azp: torch.Tensor | None = None,
873
+ bias: torch.Tensor | None = None,
874
+ ) -> torch.Tensor:
875
+ """
876
+ :param azp_adj: In the per-tensor case, this should include the azp.
877
+ Always per-channel.
878
+ :param azp: Only set in the per-token case. Per-token if set.
879
+ """
880
+ assert b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
881
+ assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
882
+ assert bias is None or bias.numel() == b.shape[1] and bias.dtype == out_dtype
883
+
884
+ # Massage the input to be 2D
885
+ target_shape = (*a.shape[:-1], b.shape[1])
886
+ a = a.view(-1, a.shape[-1])
887
+ assert azp is None or azp.numel() == a.shape[0]
888
+
889
+ out = torch.empty((a.shape[0], b.shape[1]), dtype=out_dtype, device=a.device)
890
+ torch.ops._C.cutlass_scaled_mm_azp(out, a, b, scale_a, scale_b, azp_adj, azp, bias)
891
+ return out.view(*target_shape)
892
+
893
+
894
+ def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
895
+ return torch.ops._C.cutlass_sparse_scaled_mm_supported(cuda_device_capability)
896
+
897
+
898
+ def cutlass_group_gemm_supported(cuda_device_capability: int) -> bool:
899
+ try:
900
+ return torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability)
901
+ except AttributeError:
902
+ # Return False on non-CUDA platforms where it is not available
903
+ return False
904
+
905
+
906
+ def cutlass_sparse_compress(a: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
907
+ """
908
+ Compresses a sparse matrix for use with Cutlass sparse operations.
909
+
910
+ This function takes a dense tensor and compresses it into two components:
911
+ non-zero elements and metadata. The compressed representation is compatible
912
+ with Cutlass sparse kernels.
913
+
914
+ Args:
915
+ a (torch.Tensor):
916
+ The input tensor to be compressed. Must have one of the following data types:
917
+ - `torch.int8`
918
+ - `torch.float8_e4m3fn`
919
+ - `torch.bfloat16`
920
+ - `torch.float16`
921
+
922
+ Returns:
923
+ tuple[torch.Tensor, torch.Tensor]:
924
+ A tuple containing:
925
+ - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
926
+ - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.
927
+
928
+ Raises:
929
+ ValueError: If the compression operation fails.
930
+
931
+ Notes:
932
+ - The `a_meta` tensor has a data type of `torch.uint8`.
933
+ - Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
934
+ - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
935
+ - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
936
+ """
937
+ assert a.dtype in [torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16]
938
+ assert a.is_contiguous()
939
+
940
+ # a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
941
+ elemsPerMetaElem = 4
942
+ assert a.shape[1] % (2 * elemsPerMetaElem) == 0
943
+
944
+ return torch.ops._C.cutlass_sparse_compress(a)
945
+
946
+
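# Shape sketch for the 2:4 compression above (sizes are illustrative only):
#
#   a      : (16, 128)  torch.float8_e4m3fn, pruned to 2:4 sparsity along k
#   a_nzs, a_meta = cutlass_sparse_compress(a)
#   a_nzs  : (16, 64)   same dtype as a   (k // 2 kept non-zeros per row)
#   a_meta : (16, 16)   torch.uint8       (k // 2 // elemsPerMetaElem bytes per row)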
947
+ def cutlass_scaled_sparse_mm(
948
+ a: torch.Tensor,
949
+ bt_nzs: torch.Tensor,
950
+ bt_meta: torch.Tensor,
951
+ scale_a: torch.Tensor,
952
+ scale_b: torch.Tensor,
953
+ out_dtype: torch.dtype,
954
+ bias: torch.Tensor | None = None,
955
+ ) -> torch.Tensor:
956
+ """
957
+ Performs a scaled sparse matrix multiplication using Cutlass.
958
+
959
+ Steps:
960
+ 1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
961
+ `a = torch.randn((m, k), device='cuda')`.
962
+
963
+ 2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
964
+ `b = torch.randn((k, n), device='cuda')`.
965
+
966
+ 3. Prune matrix `b` to 2:4 sparsity along the specified dimension:
967
+ `b = prune_to_2_4(b, dim=0)`.
968
+
969
+ 4. Compress the transposed sparse matrix `b.t()`:
970
+ `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.
971
+
972
+ 5. Perform sparse matrix multiplication using the compressed matrix,
973
+ applying scaling factors for `a` and `b`, and the output data type:
974
+ `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.
975
+
976
+ Returns:
977
+ - The result of the scaled sparse matrix multiplication.
978
+ """
979
+ assert bt_nzs.shape[0] % 16 == 0 and bt_nzs.shape[1] % 16 == 0
980
+ assert out_dtype is torch.bfloat16 or out_dtype is torch.float16
981
+ assert bias is None or bias.shape[0] == bt_nzs.shape[0] and bias.dtype == out_dtype
982
+
983
+ m = a.shape[0]
984
+ n = bt_nzs.shape[0]
985
+ out = torch.empty((m, n), dtype=out_dtype, device=a.device)
986
+
987
+ torch.ops._C.cutlass_scaled_sparse_mm(
988
+ out, a, bt_nzs, bt_meta, scale_a, scale_b, bias
989
+ )
990
+
991
+ return out
992
+
993
+
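# The docstring steps above, condensed into one sketch. prune_to_2_4 is a
# placeholder helper (as in the docstring) and all shapes, dtypes, and scales
# are illustrative assumptions:
#
#   m, n, k = 32, 512, 256
#   a = torch.randn((m, k), device="cuda").to(torch.float8_e4m3fn)
#   b = prune_to_2_4(torch.randn((k, n), device="cuda"), dim=0).to(torch.float8_e4m3fn)
#   bt_nzs, bt_meta = cutlass_sparse_compress(b.t())
#   scale_a = torch.ones((1, 1), device="cuda")   # per-tensor scales for simplicity
#   scale_b = torch.ones((1, 1), device="cuda")
#   out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b,
#                                  out_dtype=torch.bfloat16)   # shape (m, n)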
994
+ def get_cutlass_moe_mm_data(
995
+ topk_ids: torch.Tensor,
996
+ expert_offsets: torch.Tensor,
997
+ problem_sizes1: torch.Tensor,
998
+ problem_sizes2: torch.Tensor,
999
+ input_permutation: torch.Tensor,
1000
+ output_permutation: torch.Tensor,
1001
+ num_experts: int,
1002
+ n: int,
1003
+ k: int,
1004
+ blockscale_offsets: torch.Tensor | None = None,
1005
+ ):
1006
+ """
1007
+ Prepare data necessary to perform CUTLASS grouped matrix multiplications
1008
+ used in CUTLASS-based fused MoE.
1009
+
1010
+ The function takes in topk_ids (token-expert mapping) and uses it to
1011
+ compute:
1012
+ - expert_offsets: Indices that mark at which token index each expert begins
1013
+ its computation after the input is sorted with
1014
+ input_permutation. The number of tokens computed with
1015
+ expert E is expert_offsets[E + 1] - expert_offsets[E]
1016
+ - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
1017
+ multiplication in two grouped MMs used in
1018
+ the fused MoE operation.
1019
+ - input_permutation: Permutation that must be used to shuffle the input
1020
+ before executing the MMs.
1021
+ - output_permutation: Permutation that must be used to shuffle the output
1022
+ after executing the MMs.
1023
+ - blockscale_offsets: Optional argument passed for fp4 moe. Indices that
1024
+ mark at which block scale index each expert begins
1025
+ its computation. The number of block scale rows
1026
+ computed with expert E is blockscale_offsets[E + 1] -
1027
+ blockscale_offsets[E]
1028
+ """
1029
+ return torch.ops._C.get_cutlass_moe_mm_data(
1030
+ topk_ids,
1031
+ expert_offsets,
1032
+ problem_sizes1,
1033
+ problem_sizes2,
1034
+ input_permutation,
1035
+ output_permutation,
1036
+ num_experts,
1037
+ n,
1038
+ k,
1039
+ blockscale_offsets,
1040
+ )
1041
+
1042
+
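# A rough plain-torch picture of the offsets bookkeeping this helper fills in
# (illustrative values; the real op also builds the permutations on device):
#
#   topk_ids = torch.tensor([[0, 2], [1, 2], [2, 2]])          # 3 tokens, top-2
#   counts = torch.bincount(topk_ids.flatten(), minlength=3)   # [1, 1, 4]
#   expert_offsets = torch.zeros(4, dtype=torch.long)
#   expert_offsets[1:] = torch.cumsum(counts, dim=0)           # [0, 1, 2, 6]
#   # tokens handled by expert E == expert_offsets[E + 1] - expert_offsets[E]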
1043
+ def get_cutlass_moe_mm_problem_sizes(
1044
+ topk_ids: torch.Tensor,
1045
+ problem_sizes1: torch.Tensor,
1046
+ problem_sizes2: torch.Tensor,
1047
+ num_experts: int,
1048
+ n: int,
1049
+ k: int,
1050
+ blockscale_offsets: torch.Tensor | None = None,
1051
+ force_swap_ab: bool | None = None,
1052
+ ):
1053
+ """
1054
+ Compute only the per-expert problem sizes needed by the two grouped matrix
1055
+ multiplications used in CUTLASS-based fused MoE.
1056
+
1057
+ The function takes in topk_ids (token→expert mapping) and computes:
1058
+ - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's
1059
+ multiplication for the two grouped MMs
1060
+ used in the fused MoE operation.
1061
+ Optional:
1062
+ - force_swap_ab: If set to True or False, explicitly enable or disable the
1063
+ A/B input swap optimization. If None (default), the swap
1064
+ is selected automatically based on tensor sizes.
1065
+ """
1066
+ return torch.ops._C.get_cutlass_moe_mm_problem_sizes(
1067
+ topk_ids,
1068
+ problem_sizes1,
1069
+ problem_sizes2,
1070
+ num_experts,
1071
+ n,
1072
+ k,
1073
+ blockscale_offsets,
1074
+ force_swap_ab,
1075
+ )
1076
+
1077
+
1078
+ def get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
1079
+ expert_first_token_offset: torch.Tensor,
1080
+ problem_sizes1: torch.Tensor,
1081
+ problem_sizes2: torch.Tensor,
1082
+ n: int,
1083
+ k: int,
1084
+ swap_ab: bool,
1085
+ ):
1086
+ """Compute per-expert (M, N, K) problem sizes from expert_first_token_offset"""
1087
+ return torch.ops._C.get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
1088
+ expert_first_token_offset,
1089
+ problem_sizes1,
1090
+ problem_sizes2,
1091
+ n,
1092
+ k,
1093
+ swap_ab,
1094
+ )
1095
+
1096
+
1097
+ def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
1098
+ """
1099
+ Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor.
1100
+ This is used in MoE to permute the input tensor before performing grouped matrix multiplications.
1101
+ """
1102
+ num_tokens_permuted = dst2src_map.shape[0]
1103
+ output_tensor = torch.empty(
1104
+ (num_tokens_permuted, input_tensor.shape[1]),
1105
+ device=input_tensor.device,
1106
+ dtype=input_tensor.dtype,
1107
+ )
1108
+ torch.ops._moe_C.shuffle_rows(input_tensor, dst2src_map, output_tensor)
1109
+ return output_tensor
1110
+
1111
+
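+ # Semantics sketch for shuffle_rows: row i of the output is row dst2src_map[i]
+ # of the input, so up to kernel and device details (an assumed equivalence,
+ # not a statement from the kernel itself) it matches plain advanced indexing:
+ def _shuffle_rows_reference(
+     input_tensor: torch.Tensor, dst2src_map: torch.Tensor
+ ) -> torch.Tensor:
+     return input_tensor[dst2src_map.to(torch.long)]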
1112
+ def get_cutlass_pplx_moe_mm_data(
1113
+ expert_offsets: torch.Tensor,
1114
+ problem_sizes1: torch.Tensor,
1115
+ problem_sizes2: torch.Tensor,
1116
+ expert_num_tokens: torch.Tensor,
1117
+ num_local_experts: int,
1118
+ padded_m: int,
1119
+ n: int,
1120
+ k: int,
1121
+ ):
1122
+ """
1123
+ Prepare data necessary to perform CUTLASS grouped matrix multiplications
1124
+ used in CUTLASS-based fused MoE.
1125
+
1126
+ The function takes in expert_num_tokens (token count per expert) and
1127
+ uses it to compute:
1129
+ - expert_offsets: Indices that mark at which token index each expert begins
1130
+ its computation.
1131
+ - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
1132
+ multiplication in two grouped MMs used in
1133
+ the fused MoE operation.
1134
+ """
1135
+ return torch.ops._C.get_cutlass_pplx_moe_mm_data(
1136
+ expert_offsets,
1137
+ problem_sizes1,
1138
+ problem_sizes2,
1139
+ expert_num_tokens,
1140
+ num_local_experts,
1141
+ padded_m,
1142
+ n,
1143
+ k,
1144
+ )
1145
+
1146
+
1147
+ def cutlass_moe_mm(
1148
+ out_tensors: torch.Tensor,
1149
+ a_tensors: torch.Tensor,
1150
+ b_tensors: torch.Tensor,
1151
+ a_scales: torch.Tensor,
1152
+ b_scales: torch.Tensor,
1153
+ expert_offsets: torch.Tensor,
1154
+ problem_sizes: torch.Tensor,
1155
+ a_strides: torch.Tensor,
1156
+ b_strides: torch.Tensor,
1157
+ c_strides: torch.Tensor,
1158
+ per_act_token: bool,
1159
+ per_out_ch: bool,
1160
+ ):
1161
+ """
1162
+ A single grouped matrix multiplication used in CUTLASS-based fused MoE.
1163
+ The function executes fp8-quantized OUT = AB matrix multiplication.
1164
+
1165
+ - expert_offsets: Indices that mark at which token index each expert begins
1166
+ its computation. The number of tokens computed with
1167
+ expert E is expert_offsets[E + 1] - expert_offsets[E]
1168
+ - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
1169
+ MMs used in the fused MoE operation.
1170
+ - a/b/c_strides: The data strides passed to grouped matrix multiplication.
1171
+ """
1172
+ return torch.ops._C.cutlass_moe_mm(
1173
+ out_tensors,
1174
+ a_tensors,
1175
+ b_tensors,
1176
+ a_scales,
1177
+ b_scales,
1178
+ expert_offsets,
1179
+ problem_sizes,
1180
+ a_strides,
1181
+ b_strides,
1182
+ c_strides,
1183
+ per_act_token,
1184
+ per_out_ch,
1185
+ )
1186
+
1187
+
1188
+ def cutlass_fp4_moe_mm(
1189
+ out_tensors: torch.Tensor,
1190
+ a_tensors: torch.Tensor,
1191
+ b_tensors: torch.Tensor,
1192
+ a_scales: torch.Tensor,
1193
+ b_scales: torch.Tensor,
1194
+ alphas: torch.Tensor,
1195
+ problem_sizes: torch.Tensor,
1196
+ expert_offsets: torch.Tensor,
1197
+ sf_offsets: torch.Tensor,
1198
+ ):
1199
+ """
1200
+ An FP4 Blockscaled Group Gemm that takes in a_tensors, b_tensors and runs
1201
+ the gemms for each combination based on the specified problem sizes.
1202
+
1203
+ This is used as the MoE gemm during NVFP4 Quantized FusedMoE forward.
1204
+ - a/b_tensors: the NVFP4 a_ptrs and b_ptrs tensors which are quantized
1205
+ input and expert weights.
1206
+ - a_/b_scales: The blockscales in FP8-E4M3 precision
1207
+ - expert_offsets/sf_offsets: Indices that mark at which token index
1208
+ each expert begins its computation. The number of tokens
1209
+ computed with expert E is expert_offsets[E + 1] -
1210
+ expert_offsets[E], and the number of scale-factor rows per expert
1211
+ is sf_offsets[E + 1] - sf_offsets[E].
1212
+ - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
1213
+ MMs used in the fused MoE operation.
1214
+ """
1215
+ return torch.ops._C.cutlass_fp4_group_mm(
1216
+ out_tensors,
1217
+ a_tensors,
1218
+ b_tensors,
1219
+ a_scales,
1220
+ b_scales,
1221
+ alphas,
1222
+ problem_sizes,
1223
+ expert_offsets,
1224
+ sf_offsets,
1225
+ )
1226
+
1227
+
1228
+ # gptq_marlin
1229
+ def gptq_marlin_repack(
1230
+ b_q_weight: torch.Tensor,
1231
+ perm: torch.Tensor,
1232
+ size_k: int,
1233
+ size_n: int,
1234
+ num_bits: int,
1235
+ is_a_8bit: bool = False,
1236
+ ) -> torch.Tensor:
1237
+ return torch.ops._C.gptq_marlin_repack(
1238
+ b_q_weight, perm, size_k, size_n, num_bits, is_a_8bit
1239
+ )
1240
+
1241
+
1242
+ if hasattr(torch.ops._C, "gptq_marlin_repack"):
1243
+
1244
+ @register_fake("_C::gptq_marlin_repack")
1245
+ def _gptq_marlin_repack_fake(
1246
+ b_q_weight: torch.Tensor,
1247
+ perm: torch.Tensor,
1248
+ size_k: torch.SymInt,
1249
+ size_n: torch.SymInt,
1250
+ num_bits: int,
1251
+ is_a_8bit: bool = False,
1252
+ ) -> torch.Tensor:
1253
+ pack_factor = 32 // num_bits
1254
+ marlin_tile_size = 16
1255
+ return torch.empty(
1256
+ (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor),
1257
+ dtype=b_q_weight.dtype,
1258
+ device=b_q_weight.device,
1259
+ )
1260
+
1261
+
1262
+ # awq_marlin
1263
+ def awq_marlin_repack(
1264
+ b_q_weight: torch.Tensor,
1265
+ size_k: int,
1266
+ size_n: int,
1267
+ num_bits: int,
1268
+ is_a_8bit: bool = False,
1269
+ ) -> torch.Tensor:
1270
+ return torch.ops._C.awq_marlin_repack(
1271
+ b_q_weight, size_k, size_n, num_bits, is_a_8bit
1272
+ )
1273
+
1274
+
1275
+ if hasattr(torch.ops._C, "awq_marlin_repack"):
1276
+
1277
+ @register_fake("_C::awq_marlin_repack")
1278
+ def _awq_marlin_repack_fake(
1279
+ b_q_weight: torch.Tensor,
1280
+ size_k: torch.SymInt,
1281
+ size_n: torch.SymInt,
1282
+ num_bits: int,
1283
+ is_a_8bit: bool = False,
1284
+ ) -> torch.Tensor:
1285
+ pack_factor = 32 // num_bits
1286
+ marlin_tile_size = 16
1287
+ return torch.empty(
1288
+ (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor),
1289
+ dtype=b_q_weight.dtype,
1290
+ device=b_q_weight.device,
1291
+ )
1292
+
1293
+
1294
+ def gptq_marlin_moe_repack(
1295
+ b_q_weight: torch.Tensor,
1296
+ perm: torch.Tensor,
1297
+ size_k: int,
1298
+ size_n: int,
1299
+ num_bits: int,
1300
+ is_a_8bit: bool = False,
1301
+ ) -> torch.Tensor:
1302
+ num_experts = b_q_weight.shape[0]
1303
+ assert size_k % 16 == 0
1304
+ output = torch.empty(
1305
+ (num_experts, size_k // 16, size_n * (num_bits // 2)),
1306
+ device=b_q_weight.device,
1307
+ dtype=b_q_weight.dtype,
1308
+ )
1309
+ for e in range(num_experts):
1310
+ output[e] = torch.ops._C.gptq_marlin_repack(
1311
+ b_q_weight[e], perm[e], size_k, size_n, num_bits, is_a_8bit
1312
+ )
1313
+ return output
1314
+
1315
+
1316
+ def awq_marlin_moe_repack(
1317
+ b_q_weight: torch.Tensor,
1318
+ perm: torch.Tensor,
1319
+ size_k: int,
1320
+ size_n: int,
1321
+ num_bits: int,
1322
+ is_a_8bit: bool = False,
1323
+ ) -> torch.Tensor:
1324
+ num_experts = b_q_weight.shape[0]
1325
+ assert size_k % 16 == 0
1326
+ output = torch.empty(
1327
+ (num_experts, size_k // 16, size_n * (num_bits // 2)),
1328
+ device=b_q_weight.device,
1329
+ dtype=b_q_weight.dtype,
1330
+ )
1331
+ for e in range(num_experts):
1332
+ output[e] = torch.ops._C.awq_marlin_repack(
1333
+ b_q_weight[e], size_k, size_n, num_bits, is_a_8bit
1334
+ )
1335
+ return output
1336
+
1337
+
1338
+ def marlin_int4_fp8_preprocess(
1339
+ qweight: torch.Tensor,
1340
+ qzeros_or_none: torch.Tensor | None = None,
1341
+ inplace: bool = False,
1342
+ ):
1343
+ return torch.ops._C.marlin_int4_fp8_preprocess(qweight, qzeros_or_none, inplace)
1344
+
1345
+
1346
+ def gptq_marlin_gemm(
1347
+ a: torch.Tensor,
1348
+ c: torch.Tensor | None,
1349
+ b_q_weight: torch.Tensor,
1350
+ b_bias: torch.Tensor | None,
1351
+ b_scales: torch.Tensor,
1352
+ a_scales: torch.Tensor | None,
1353
+ global_scale: torch.Tensor | None,
1354
+ b_zeros: torch.Tensor | None,
1355
+ g_idx: torch.Tensor | None,
1356
+ perm: torch.Tensor | None,
1357
+ workspace: torch.Tensor,
1358
+ b_q_type: ScalarType,
1359
+ size_m: int,
1360
+ size_n: int,
1361
+ size_k: int,
1362
+ is_k_full: bool = True,
1363
+ use_atomic_add: bool = False,
1364
+ use_fp32_reduce: bool = False,
1365
+ is_zp_float: bool = False,
1366
+ ) -> torch.Tensor:
1367
+ return torch.ops._C.gptq_marlin_gemm(
1368
+ a,
1369
+ c,
1370
+ b_q_weight,
1371
+ b_bias,
1372
+ b_scales,
1373
+ a_scales,
1374
+ global_scale,
1375
+ b_zeros,
1376
+ g_idx,
1377
+ perm,
1378
+ workspace,
1379
+ b_q_type.id,
1380
+ size_m,
1381
+ size_n,
1382
+ size_k,
1383
+ is_k_full,
1384
+ use_atomic_add,
1385
+ use_fp32_reduce,
1386
+ is_zp_float,
1387
+ )
1388
+
1389
+
1390
+ # machete
1391
+ def machete_supported_schedules(
1392
+ a_type: torch.dtype,
1393
+ b_type: ScalarType,
1394
+ group_scales_type: torch.dtype | None,
1395
+ group_zeros_type: torch.dtype | None = None,
1396
+ channel_scales_type: torch.dtype | None = None,
1397
+ token_scales_type: torch.dtype | None = None,
1398
+ out_type: torch.dtype | None = None,
1399
+ ) -> list[str]:
1400
+ return torch.ops._C.machete_supported_schedules(
1401
+ a_type,
1402
+ b_type.id,
1403
+ group_scales_type,
1404
+ group_zeros_type,
1405
+ channel_scales_type,
1406
+ token_scales_type,
1407
+ out_type,
1408
+ )
1409
+
1410
+
1411
+ def machete_mm(
1412
+ a: torch.Tensor,
1413
+ # b_q Should be the tensor returned by machete_prepack_B
1414
+ b_q: torch.Tensor,
1415
+ b_type: ScalarType,
1416
+ out_type: torch.dtype | None = None,
1417
+ b_group_scales: torch.Tensor | None = None,
1418
+ b_group_zeros: torch.Tensor | None = None,
1419
+ b_group_size: int | None = None,
1420
+ b_channel_scales: torch.Tensor | None = None,
1421
+ a_token_scales: torch.Tensor | None = None,
1422
+ schedule: str | None = None,
1423
+ ) -> torch.Tensor:
1424
+ return torch.ops._C.machete_mm(
1425
+ a,
1426
+ b_q,
1427
+ b_type.id,
1428
+ out_type,
1429
+ b_group_scales,
1430
+ b_group_zeros,
1431
+ b_group_size,
1432
+ b_channel_scales,
1433
+ a_token_scales,
1434
+ schedule,
1435
+ )
1436
+
1437
+
1438
+ def machete_prepack_B(
1439
+ b_q_weight: torch.Tensor,
1440
+ a_type: torch.dtype,
1441
+ b_type: ScalarType,
1442
+ group_scales_type: torch.dtype | None,
1443
+ ) -> torch.Tensor:
1444
+ return torch.ops._C.machete_prepack_B(
1445
+ b_q_weight, a_type, b_type.id, group_scales_type
1446
+ )
1447
+
1448
+
1449
+ # CUTLASS W4A8
1450
+ def cutlass_w4a8_mm(
1451
+ a: torch.Tensor,
1452
+ # b_q Should be the tensor returned by cutlass_encode_and_reorder_int4b
1453
+ b_q: torch.Tensor,
1454
+ b_group_scales: torch.Tensor,
1455
+ b_group_size: int,
1456
+ b_channel_scales: torch.Tensor,
1457
+ a_token_scales: torch.Tensor,
1458
+ out_type: torch.dtype | None = None,
1459
+ maybe_schedule: str | None = None,
1460
+ ) -> torch.Tensor:
1461
+ return torch.ops._C.cutlass_w4a8_mm(
1462
+ a,
1463
+ b_q,
1464
+ b_group_scales,
1465
+ b_group_size,
1466
+ b_channel_scales,
1467
+ a_token_scales,
1468
+ out_type,
1469
+ maybe_schedule,
1470
+ )
1471
+
1472
+
1473
+ def cutlass_pack_scale_fp8(scales: torch.Tensor) -> torch.Tensor:
1474
+ return torch.ops._C.cutlass_pack_scale_fp8(scales)
1475
+
1476
+
1477
+ def cutlass_encode_and_reorder_int4b(b: torch.Tensor) -> torch.Tensor:
1478
+ return torch.ops._C.cutlass_encode_and_reorder_int4b(b)
1479
+
1480
+
1481
+ def cutlass_w4a8_moe_mm(
1482
+ out_tensors: torch.Tensor,
1483
+ a_tensors: torch.Tensor,
1484
+ b_tensors: torch.Tensor,
1485
+ a_scales: torch.Tensor,
1486
+ b_scales: torch.Tensor,
1487
+ b_group_scales: torch.Tensor,
1488
+ b_group_size: int,
1489
+ expert_offsets: torch.Tensor,
1490
+ problem_sizes: torch.Tensor,
1491
+ a_strides: torch.Tensor,
1492
+ b_strides: torch.Tensor,
1493
+ c_strides: torch.Tensor,
1494
+ group_scale_strides: torch.Tensor,
1495
+ maybe_schedule: str | None = None,
1496
+ ):
1497
+ """
1498
+ Executes the CUTLASS-based fused-MoE grouped matrix multiplication for the
1499
+ W4A8 quantization scheme. Uses group-wise quantization (INT4 -> FP8)
1500
+ and both per-channel + per-token scaling in the epilogue.
1501
+
1502
+ Args:
1503
+ out_tensors:
1504
+ Output buffer for all experts (updated in-place).
1505
+ a_tensors:
1506
+ FP8 (E4M3FN) activations for all experts.
1507
+ b_tensors:
1508
+ INT4-packed weight matrix for all experts, packed to INT32
1509
+ a_scales:
1510
+ Per-token FP8 activation scales, applied in the epilogue.
1511
+ b_scales:
1512
+ Per-channel FP8 weight scales for each expert, applied in the epilogue.
1513
+ b_group_scales:
1514
+ FP8 scale values for group-wise INT4 weight blocks.
1515
+ b_group_size:
1516
+ Number of elements grouped under each entry of b_group_scales.
1517
+ expert_offsets:
1518
+ Cumulative token offsets
1519
+ problem_sizes:
1520
+ Per-expert (M, N, K) GEMM sizes used by the grouped GEMM launcher.
1521
+ a/b/c/group_scale_strides:
1522
+ Strides describing the memory layout of the input tensors.
1523
+ maybe_schedule:
1524
+ Optional override to choose a specific kernel or epilogue schedule.
1525
+
1526
+ Returns:
1527
+ out_tensors updated in-place with the dequantized INT4xFP8 grouped GEMM result.
1528
+ """
1529
+ return torch.ops._C.cutlass_w4a8_moe_mm(
1530
+ out_tensors,
1531
+ a_tensors,
1532
+ b_tensors,
1533
+ a_scales,
1534
+ b_scales,
1535
+ b_group_scales,
1536
+ b_group_size,
1537
+ expert_offsets,
1538
+ problem_sizes,
1539
+ a_strides,
1540
+ b_strides,
1541
+ c_strides,
1542
+ group_scale_strides,
1543
+ maybe_schedule,
1544
+ )
1545
+
1546
+
1547
+ def cutlass_encode_and_reorder_int4b_grouped(
1548
+ b_tensors: torch.Tensor,
1549
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1550
+ return torch.ops._C.cutlass_encode_and_reorder_int4b_grouped(b_tensors)
1551
+
1552
+
1553
+ if hasattr(torch.ops._C, "permute_cols"):
1554
+
1555
+ @register_fake("_C::permute_cols")
1556
+ def _permute_cols_fake(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor:
1557
+ return torch.empty_like(a)
1558
+
1559
+
1560
+ def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor:
1561
+ return torch.ops._C.permute_cols(a, perm)
1562
+
1563
+
1564
+ # fp4
1565
+ def scaled_fp4_quant(
1566
+ input: torch.Tensor, input_global_scale: torch.Tensor
1567
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1568
+ """
1569
+ Quantize input tensor to FP4 and return quantized tensor and scale.
1570
+
1571
+ This function quantizes the last dimension of the given tensor `input`. For
1572
+ every 16 consecutive elements, a single dynamically computed scaling factor
1573
+ is shared. This scaling factor is quantized using the `input_global_scale`
1574
+ and is stored in a swizzled layout (see
1575
+ https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).
1576
+
1577
+ Args:
1578
+ input: The input tensor to be quantized to FP4
1579
+ input_global_scale: A scalar scaling factor for the entire tensor.
1580
+
1581
+ Returns:
1582
+ tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 (every
1583
+ two values packed into one uint8) and the float8_e4m3 scaling factors
1584
+ in the swizzled layout.
1585
+ """
1586
+ assert not current_platform.is_rocm()
1587
+ assert input.ndim >= 1, f"input.ndim needs to be >= 1, but got {input.ndim}."
1588
+ other_dims = 1 if input.ndim == 1 else -1
1589
+ input = input.reshape(other_dims, input.shape[-1])
1590
+ m, n = input.shape
1591
+ block_size = 16
1592
+ device = input.device
1593
+
1594
+ assert n % block_size == 0, f"last dim has to be multiple of 16, but got {n}."
1595
+ assert input.dtype in (torch.float16, torch.bfloat16), (
1596
+ f"input.dtype needs to be fp16 or bf16 but got {input.dtype}."
1597
+ )
1598
+
1599
+ # Two fp4 values will be packed into a uint8.
1600
+ output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
1601
+
1602
+ # We use the rounded values to store the swizzled values. Due to the
1603
+ # requirement of the Tensor Core, the minimum tile is 128x4 for the scales.
1604
+ # So, we first pad the scales to multiples of 128 and 4. Then, the scales
1605
+ # (in float8_e4m3fn) are packed into an int32 for every 4 values. More:
1606
+ # https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x
1607
+ round_up = lambda x, y: (x + y - 1) // y * y
1608
+ rounded_m = round_up(m, 128)
1609
+ scale_n = n // block_size
1610
+ rounded_n = round_up(scale_n, 4)
1611
+ output_scale = torch.empty(
1612
+ (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
1613
+ )
1614
+
1615
+ torch.ops._C.scaled_fp4_quant(output, input, output_scale, input_global_scale)
1616
+ output_scale = output_scale.view(torch.float8_e4m3fn)
1617
+ return output, output_scale
1618
+
1619
+
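+ # A minimal usage sketch for scaled_fp4_quant (illustrative shapes; assumes a
+ # CUDA device). The unit global scale is a placeholder: real callers derive it
+ # from the tensor's statistics per their quantization scheme. With m=256 and
+ # n=512 the packed output is (256, 256) uint8 and the scales come back as a
+ # swizzled float8_e4m3fn view padded to the 128x4 tile described above.
+ def _scaled_fp4_quant_example() -> tuple[torch.Tensor, torch.Tensor]:
+     x = torch.randn(256, 512, device="cuda", dtype=torch.bfloat16)
+     global_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+     return scaled_fp4_quant(x, global_scale)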
1620
+ def scaled_fp4_experts_quant(
1621
+ input_tensor: torch.Tensor,
1622
+ input_global_scale: torch.Tensor,
1623
+ expert_offsets: torch.Tensor,
1624
+ blockscale_offsets: torch.Tensor,
1625
+ topk: int,
1626
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1627
+ """
1628
+ Quantize input tensor to NVFP4 and return quantized tensor and scale, for
1629
+ packed MoE Inputs.
1630
+ Args:
1631
+ input_tensor: The input tensor to be quantized to NVFP4
1632
+ input_global_scale: A scalar scaling factor for the entire tensor.
1633
+ expert_offsets: The expert offsets tensor
1634
+ blockscale_offsets: The blockscale offsets tensor
1635
+ Outputs:
1636
+ output: The quantized tensor in NVFP4
1637
+ output_scales: The blockscale tensor in FP8-E4M3
1638
+ """
1639
+ assert not current_platform.is_rocm()
1640
+ assert input_tensor.ndim == 2, (
1641
+ f"input.ndim needs to be == 2, but got {input_tensor.ndim}."
1642
+ )
1643
+
1644
+ # Control the maximum number of tokens per expert supported by the
1645
+ # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
1646
+ # from running out of memory. This value can also be increased to support
1647
+ # larger models.
1648
+ MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
1649
+ m_numtopk, k = input_tensor.shape
1650
+
1651
+ assert m_numtopk <= MAX_TOKENS_PER_EXPERT * topk, (
1652
+ f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
1653
+ f"{MAX_TOKENS_PER_EXPERT})"
1654
+ f" for cutlass_moe_fp4, observed m_numtopk = {m_numtopk}. Use"
1655
+ f" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value."
1656
+ )
1657
+ scales_k = k // 16
1658
+ padded_k = (scales_k + (4 - 1)) // 4
1659
+
1660
+ # output is uint8 and packed fp4 values
1661
+ output = torch.empty(
1662
+ m_numtopk, k // 2, device=input_tensor.device, dtype=torch.uint8
1663
+ )
1664
+ output_scales = torch.empty(
1665
+ MAX_TOKENS_PER_EXPERT * topk,
1666
+ padded_k,
1667
+ dtype=torch.int32,
1668
+ device=input_tensor.device,
1669
+ )
1670
+ torch.ops._C.scaled_fp4_experts_quant(
1671
+ output,
1672
+ output_scales,
1673
+ input_tensor,
1674
+ input_global_scale,
1675
+ expert_offsets,
1676
+ blockscale_offsets,
1677
+ )
1678
+ output_scales = output_scales.view(torch.float8_e4m3fn)
1679
+ return output, output_scales
1680
+
1681
+
1682
+ def silu_and_mul_scaled_fp4_experts_quant(
1683
+ input_tensor: torch.Tensor,
1684
+ input_global_scale: torch.Tensor,
1685
+ expert_offsets: torch.Tensor,
1686
+ blockscale_offsets: torch.Tensor,
1687
+ topk: int,
1688
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1689
+ """
1690
+ Fused SiLU+Mul+NVFP4 quantization for MoE intermediate activations.
1691
+
1692
+ Args:
1693
+ input_tensor: The input tensor with gate || up layout [m_topk, k*2]
1694
+ input_global_scale: A per-expert scaling factor [n_experts]
1695
+ expert_offsets: The expert offsets tensor [n_experts+1]
1696
+ blockscale_offsets: The blockscale offsets tensor [n_experts+1]
1697
+ topk: Number of top-k experts selected
1698
+ Outputs:
1699
+ output: The quantized tensor in NVFP4 [m_topk, k/2]
1700
+ output_scales: The blockscale tensor in FP8-E4M3
1701
+ """
1702
+ assert not current_platform.is_rocm()
1703
+ assert input_tensor.ndim == 2, (
1704
+ f"input.ndim needs to be == 2, but got {input_tensor.ndim}."
1705
+ )
1706
+
1707
+ # Control the maximum number of tokens per expert supported by the
1708
+ # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
1709
+ # from running out of memory. This value can also be increased to support
1710
+ # larger models.
1711
+ MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
1712
+ m_numtopk, k_times_2 = input_tensor.shape
1713
+ assert k_times_2 % 2 == 0, "input width must be even (gate || up layout)"
1714
+ k = k_times_2 // 2
1715
+
1716
+ assert m_numtopk <= MAX_TOKENS_PER_EXPERT * topk, (
1717
+ f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
1718
+ f"{MAX_TOKENS_PER_EXPERT})"
1719
+ f" for cutlass_moe_fp4, observed m_numtopk = {m_numtopk}. Use"
1720
+ f" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value."
1721
+ )
1722
+ scales_k = k // 16
1723
+ padded_k = (scales_k + (4 - 1)) // 4
1724
+
1725
+ # output is uint8 and packed fp4 values
1726
+ output = torch.empty(
1727
+ m_numtopk, k // 2, device=input_tensor.device, dtype=torch.uint8
1728
+ )
1729
+ output_scales = torch.empty(
1730
+ MAX_TOKENS_PER_EXPERT * topk,
1731
+ padded_k,
1732
+ dtype=torch.int32,
1733
+ device=input_tensor.device,
1734
+ )
1735
+ torch.ops._C.silu_and_mul_scaled_fp4_experts_quant(
1736
+ output,
1737
+ output_scales,
1738
+ input_tensor,
1739
+ input_global_scale,
1740
+ expert_offsets,
1741
+ blockscale_offsets,
1742
+ )
1743
+ output_scales = output_scales.view(torch.float8_e4m3fn)
1744
+ return output, output_scales
1745
+
1746
+
1747
+ # fp8
1748
+ def scaled_fp8_quant(
1749
+ input: torch.Tensor,
1750
+ scale: torch.Tensor | None = None,
1751
+ num_token_padding: int | None = None,
1752
+ scale_ub: torch.Tensor | None = None,
1753
+ use_per_token_if_dynamic: bool = False,
1754
+ output: torch.Tensor | None = None,
1755
+ group_shape: tuple[int, int] | None = None,
1756
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1757
+ """
1758
+ Quantize input tensor to FP8 and return quantized tensor and scale.
1759
+
1760
+ This function supports both static and dynamic quantization: If you
1761
+ provide the scale, it will use static scaling and if you omit it,
1762
+ the scale will be determined dynamically. The function also allows
1763
+ optional padding of the output tensors for downstream kernels that
1764
+ will benefit from padding.
1765
+
1766
+ Args:
1767
+ input: The input tensor to be quantized to FP8 (must be 2D: [M, N])
1768
+ scale: Optional scaling factor for the FP8 quantization. Supports:
1769
+ - 0D or [1]: per-tensor scaling
1770
+ - 1D: requires explicit group_shape to disambiguate per-channel
1771
+ vs per-token (use (-1, 1) for per-channel, (1, -1) for per-token)
1772
+ - 2D [M/group_m, N/group_n]: group scaling (e.g. [M, N/128] for
1773
+ DeepSeek-style (1,128) groups, or [M/128, N/128] for (128,128))
1774
+ scale_ub: Optional upper bound for scaling factor in dynamic
1775
+ per token case
1776
+ num_token_padding: If specified, pad the first dimension
1777
+ of the output to at least this value.
1778
+ use_per_token_if_dynamic: Whether to do per_tensor or per_token
1779
+ in the dynamic quantization case.
1780
+ group_shape: Optional tuple (group_m, group_n) specifying the group
1781
+ shape for static quantization. Use -1 for "full extent" (e.g.,
1782
+ (-1, -1) for per-tensor, (-1, 1) for per-channel, etc.)
1783
+ Required for 1D scales; optional for 2D scales.
1784
+
1785
+ Returns:
1786
+ tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
1787
+ scaling factor.
1788
+ """
1789
+ # This code assumes batch_dim and num_tokens are flattened
1790
+ assert input.ndim == 2
1791
+ shape: tuple[int, int] | torch.Size = input.shape
1792
+ # For ROCm on MI300, the output fp8 dtype is torch.float8_e4m3fnuz
1793
+ out_dtype: torch.dtype = current_platform.fp8_dtype()
1794
+ if num_token_padding:
1795
+ shape = (max(num_token_padding, input.shape[0]), shape[1])
1796
+ if output is None:
1797
+ output = torch.empty(shape, device=input.device, dtype=out_dtype)
1798
+ else:
1799
+ assert num_token_padding is None, "padding not supported if output passed in"
1800
+ assert output.dtype == out_dtype
1801
+
1802
+ if scale is None:
1803
+ if use_per_token_if_dynamic:
1804
+ scale = torch.empty((shape[0], 1), device=input.device, dtype=torch.float32)
1805
+ torch.ops._C.dynamic_per_token_scaled_fp8_quant(
1806
+ output, input, scale, scale_ub
1807
+ )
1808
+ else:
1809
+ scale = torch.empty(1, device=input.device, dtype=torch.float32)
1810
+ torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
1811
+ else:
1812
+ torch.ops._C.static_scaled_fp8_quant(output, input, scale, group_shape)
1813
+
1814
+ return output, scale
1815
+
1816
+
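+ # A short sketch of the static and dynamic paths described above (illustrative
+ # shapes; assumes a CUDA/ROCm device so current_platform.fp8_dtype() resolves).
+ def _scaled_fp8_quant_example() -> None:
+     x = torch.randn(64, 4096, device="cuda", dtype=torch.bfloat16)
+     # Dynamic per-tensor: the kernel computes and returns a single scale.
+     q_dyn, s_dyn = scaled_fp8_quant(x)
+     # Dynamic per-token: one scale per row, returned with shape (64, 1).
+     q_tok, s_tok = scaled_fp8_quant(x, use_per_token_if_dynamic=True)
+     # Static per-tensor: reuse a precomputed scale.
+     q_static, _ = scaled_fp8_quant(x, scale=s_dyn)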
1817
+ # gptq allspark
1818
+ def allspark_repack_weight(
1819
+ qweight: torch.Tensor,
1820
+ scale: torch.Tensor,
1821
+ zero_point: torch.Tensor | None = None,
1822
+ has_zp: bool = False,
1823
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1824
+ """
1825
+ Rearrange qweight, scale, and zero_point(if asymmetric) to n32k16 format
1826
+ for Ampere W8A16 Fused Gemm kernel
1827
+
1828
+ Args:
1829
+ qweight: uint8 weight tensor, original k x n format.
1830
+ scale: fp16/bf16 weight scale tensor, 1 x n format.
1831
+ zero_point: fp16/bf16 weight zero_point tensor, 1 x n format.
1832
+ Must be provided for asymmetric quantization.
1833
+ has_zp: if use symmetric quantization, has_zp = False.
1834
+ if use asymmetric quantization, has_zp = True.
1835
+
1836
+ Returns:
1837
+ tuple[torch.Tensor, torch.Tensor, torch.Tensor | None] :
1838
+ rearranged weight, scale, and optionally zero_point.
1839
+ """
1840
+ K = qweight.shape[0]
1841
+ N = qweight.shape[1]
1842
+ N_32align = (N + 32 - 1) // 32 * 32
1843
+
1844
+ qweight_reorder = torch.empty(
1845
+ (N_32align, K), device=qweight.device, dtype=qweight.dtype
1846
+ )
1847
+ scale_reorder = torch.empty((1, N_32align), device=scale.device, dtype=scale.dtype)
1848
+ zero_point_reorder = None
1849
+ if has_zp:
1850
+ assert zero_point is not None, (
1851
+ "zero_point must be provided for asymmetric quantization."
1852
+ )
1853
+ zero_point_reorder = torch.empty(
1854
+ (1, N_32align), device=zero_point.device, dtype=zero_point.dtype
1855
+ )
1856
+
1857
+ torch.ops._C.rearrange_kn_weight_as_n32k16_order(
1858
+ qweight,
1859
+ scale,
1860
+ zero_point,
1861
+ has_zp,
1862
+ qweight_reorder,
1863
+ scale_reorder,
1864
+ zero_point_reorder,
1865
+ K,
1866
+ N,
1867
+ N_32align,
1868
+ )
1869
+
1870
+ return qweight_reorder, scale_reorder, zero_point_reorder
1871
+
1872
+
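+ # A minimal usage sketch for the n32k16 repack described above (illustrative
+ # K/N and dtypes; symmetric quantization, so no zero_point is passed).
+ def _allspark_repack_weight_example():
+     K, N = 4096, 11008
+     qweight = torch.randint(0, 256, (K, N), device="cuda", dtype=torch.uint8)
+     scale = torch.rand((1, N), device="cuda", dtype=torch.float16)
+     # The outputs are padded so that N is rounded up to a multiple of 32.
+     return allspark_repack_weight(qweight, scale, zero_point=None, has_zp=False)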
1873
+ def allspark_w8a16_gemm(
1874
+ a: torch.Tensor,
1875
+ b_qweight: torch.Tensor,
1876
+ b_scales: torch.Tensor,
1877
+ b_qzeros: torch.Tensor | None,
1878
+ n: int,
1879
+ group_size: int,
1880
+ sm_count: int,
1881
+ sm_version: int,
1882
+ CUBLAS_M_THRESHOLD: int,
1883
+ has_zp: bool,
1884
+ n32k16_reorder: bool,
1885
+ ) -> torch.Tensor:
1886
+ return torch.ops._C.allspark_w8a16_gemm(
1887
+ a,
1888
+ b_qweight,
1889
+ b_scales,
1890
+ b_qzeros,
1891
+ n,
1892
+ group_size,
1893
+ sm_count,
1894
+ sm_version,
1895
+ CUBLAS_M_THRESHOLD,
1896
+ has_zp,
1897
+ n32k16_reorder,
1898
+ )
1899
+
1900
+
1901
+ # int8
1902
+ def scaled_int8_quant(
1903
+ input: torch.Tensor,
1904
+ scale: torch.Tensor | None = None,
1905
+ azp: torch.Tensor | None = None,
1906
+ symmetric: bool = True,
1907
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
1908
+ """
1909
+ Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.
1910
+
1911
+ Args:
1912
+ input: The input tensor to be quantized to int8.
1913
+ scale: Optional scaling factor for the int8 quantization.
1914
+ When not provided, we invoke dynamic-per-token quantization.
1915
+ azp: Optional zero-point for the int8 quantization.
1916
+ Must be provided for asymmetric quantization if `scale` is provided.
1917
+ symmetric: Whether to use symmetric quantization (scale only, azp ignored).
1918
+
1919
+ Returns:
1920
+ tuple[torch.Tensor, torch.Tensor, torch.Tensor | None] : Output int8 tensor, scales, and optionally azp.
1921
+ """
1922
+ output = torch.empty_like(input, dtype=torch.int8)
1923
+ if scale is not None:
1924
+ # static-per-tensor quantization.
1925
+ assert symmetric == (azp is None), (
1926
+ "azp must only be provided for asymmetric quantization."
1927
+ )
1928
+ torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
1929
+ return output, scale, azp
1930
+
1931
+ # dynamic-per-token quantization.
1932
+ input_scales = torch.empty(
1933
+ (input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32
1934
+ )
1935
+ input_azp = None if symmetric else torch.empty_like(input_scales, dtype=torch.int32)
1936
+ torch.ops._C.dynamic_scaled_int8_quant(
1937
+ output, input.contiguous(), input_scales, input_azp
1938
+ )
1939
+ return output, input_scales, input_azp
1940
+
1941
+
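+ # A short sketch of the dynamic and static paths described above (illustrative
+ # shapes and scale value; symmetric quantization in both calls).
+ def _scaled_int8_quant_example() -> None:
+     x = torch.randn(32, 2048, device="cuda", dtype=torch.float16)
+     # Dynamic per-token: the kernel computes scales of shape (32, 1).
+     q_dyn, scales, _ = scaled_int8_quant(x)
+     # Static per-tensor, symmetric: provide the scale and no azp.
+     per_tensor_scale = torch.tensor([0.02], device="cuda", dtype=torch.float32)
+     q_static, _, _ = scaled_int8_quant(x, scale=per_tensor_scale, symmetric=True)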
1942
+ # gguf
1943
+ def ggml_dequantize(
1944
+ W: torch.Tensor, quant_type: int, m: int, n: int, dtype: torch.dtype | None
1945
+ ) -> torch.Tensor:
1946
+ return torch.ops._C.ggml_dequantize(W, quant_type, m, n, dtype)
1947
+
1948
+
1949
+ def ggml_mul_mat_vec_a8(
1950
+ W: torch.Tensor,
1951
+ X: torch.Tensor,
1952
+ quant_type: int,
1953
+ row: int,
1954
+ ) -> torch.Tensor:
1955
+ return torch.ops._C.ggml_mul_mat_vec_a8(W, X, quant_type, row)
1956
+
1957
+
1958
+ def ggml_mul_mat_a8(
1959
+ W: torch.Tensor,
1960
+ X: torch.Tensor,
1961
+ quant_type: int,
1962
+ row: int,
1963
+ ) -> torch.Tensor:
1964
+ return torch.ops._C.ggml_mul_mat_a8(W, X, quant_type, row)
1965
+
1966
+
1967
+ def ggml_moe_a8(
1968
+ X: torch.Tensor,
1969
+ W: torch.Tensor,
1970
+ sorted_token_ids: torch.Tensor,
1971
+ expert_ids: torch.Tensor,
1972
+ num_tokens_post_padded: torch.Tensor,
1973
+ quant_type: int,
1974
+ row: int,
1975
+ top_k: int,
1976
+ tokens: int,
1977
+ ) -> torch.Tensor:
1978
+ return torch.ops._C.ggml_moe_a8(
1979
+ X,
1980
+ W,
1981
+ sorted_token_ids,
1982
+ expert_ids,
1983
+ num_tokens_post_padded,
1984
+ quant_type,
1985
+ row,
1986
+ top_k,
1987
+ tokens,
1988
+ )
1989
+
1990
+
1991
+ def ggml_moe_a8_vec(
1992
+ X: torch.Tensor,
1993
+ W: torch.Tensor,
1994
+ topk_ids: torch.Tensor,
1995
+ top_k: int,
1996
+ quant_type: int,
1997
+ row: torch.SymInt,
1998
+ tokens: torch.SymInt,
1999
+ ) -> torch.Tensor:
2000
+ return torch.ops._C.ggml_moe_a8_vec(X, W, topk_ids, top_k, quant_type, row, tokens)
2001
+
2002
+
2003
+ def ggml_moe_get_block_size(quant_type: int) -> int:
2004
+ return torch.ops._C.ggml_moe_get_block_size(quant_type)
2005
+
2006
+
2007
+ # mamba
2008
+ def selective_scan_fwd(
2009
+ u: torch.Tensor,
2010
+ delta: torch.Tensor,
2011
+ A: torch.Tensor,
2012
+ B: torch.Tensor,
2013
+ C: torch.Tensor,
2014
+ D_: torch.Tensor | None,
2015
+ z_: torch.Tensor | None,
2016
+ delta_bias_: torch.Tensor | None,
2017
+ delta_softplus: bool,
2018
+ query_start_loc: torch.Tensor | None,
2019
+ cache_indices: torch.Tensor | None,
2020
+ has_initial_state: torch.Tensor | None,
2021
+ ssm_states: torch.Tensor,
2022
+ pad_slot_id: int,
2023
+ block_size: int = 1024,
2024
+ block_idx_first_scheduled_token: torch.Tensor | None = None,
2025
+ block_idx_last_scheduled_token: torch.Tensor | None = None,
2026
+ initial_state_idx: torch.Tensor | None = None,
2027
+ ):
2028
+ torch.ops._C.selective_scan_fwd(
2029
+ u,
2030
+ delta,
2031
+ A,
2032
+ B,
2033
+ C,
2034
+ D_,
2035
+ z_,
2036
+ delta_bias_,
2037
+ delta_softplus,
2038
+ query_start_loc,
2039
+ cache_indices,
2040
+ has_initial_state,
2041
+ ssm_states,
2042
+ pad_slot_id,
2043
+ block_size,
2044
+ block_idx_first_scheduled_token,
2045
+ block_idx_last_scheduled_token,
2046
+ initial_state_idx,
2047
+ )
2048
+
2049
+
2050
+ # ROCm skinny gemms
2051
+ def LLMM1(a: torch.Tensor, b: torch.Tensor, rows_per_block: int) -> torch.Tensor:
2052
+ return torch.ops._rocm_C.LLMM1(a, b, rows_per_block)
2053
+
2054
+
2055
+ def wvSplitK(
2056
+ a: torch.Tensor, b: torch.Tensor, cu_count: int, bias: torch.Tensor | None = None
2057
+ ) -> torch.Tensor:
2058
+ return torch.ops._rocm_C.wvSplitK(a, b, bias, cu_count)
2059
+
2060
+
2061
+ def wvSplitKQ(
2062
+ a: torch.Tensor,
2063
+ b: torch.Tensor,
2064
+ out_dtype: torch.dtype,
2065
+ scale_a: torch.Tensor,
2066
+ scale_b: torch.Tensor,
2067
+ cu_count: int,
2068
+ bias: torch.Tensor | None = None,
2069
+ ) -> torch.Tensor:
2070
+ out = torch.empty((b.shape[0], a.shape[0]), dtype=out_dtype, device=b.device)
2071
+ torch.ops._rocm_C.wvSplitKQ(a, b, bias, out, scale_a, scale_b, cu_count)
2072
+ return out
2073
+
2074
+
2075
+ # moe
2076
+ def moe_sum(input: torch.Tensor, output: torch.Tensor):
2077
+ torch.ops._moe_C.moe_sum(input, output)
2078
+
2079
+
2080
+ def moe_align_block_size(
2081
+ topk_ids: torch.Tensor,
2082
+ num_experts: int,
2083
+ block_size: int,
2084
+ sorted_token_ids: torch.Tensor,
2085
+ experts_ids: torch.Tensor,
2086
+ num_tokens_post_pad: torch.Tensor,
2087
+ expert_map: torch.Tensor | None = None,
2088
+ ) -> None:
2089
+ torch.ops._moe_C.moe_align_block_size(
2090
+ topk_ids,
2091
+ num_experts,
2092
+ block_size,
2093
+ sorted_token_ids,
2094
+ experts_ids,
2095
+ num_tokens_post_pad,
2096
+ expert_map,
2097
+ )
2098
+
2099
+
2100
+ def batched_moe_align_block_size(
2101
+ max_tokens_per_batch: int,
2102
+ block_size: int,
2103
+ expert_num_tokens: torch.Tensor,
2104
+ sorted_ids: torch.Tensor,
2105
+ expert_ids: torch.Tensor,
2106
+ num_tokens_post_pad: torch.Tensor,
2107
+ ) -> None:
2108
+ torch.ops._moe_C.batched_moe_align_block_size(
2109
+ max_tokens_per_batch,
2110
+ block_size,
2111
+ expert_num_tokens,
2112
+ sorted_ids,
2113
+ expert_ids,
2114
+ num_tokens_post_pad,
2115
+ )
2116
+
2117
+
2118
+ def moe_lora_align_block_size(
2119
+ topk_ids: torch.Tensor,
2120
+ token_lora_mapping: torch.Tensor,
2121
+ num_experts: int,
2122
+ block_size: int,
2123
+ max_loras: int,
2124
+ max_num_tokens_padded: int,
2125
+ max_num_m_blocks: int,
2126
+ sorted_token_ids: torch.Tensor,
2127
+ experts_ids: torch.Tensor,
2128
+ num_tokens_post_pad: torch.Tensor,
2129
+ adapter_enabled: torch.Tensor,
2130
+ lora_ids: torch.Tensor,
2131
+ expert_map: torch.Tensor | None = None,
2132
+ ) -> None:
2133
+ torch.ops._moe_C.moe_lora_align_block_size(
2134
+ topk_ids,
2135
+ token_lora_mapping,
2136
+ num_experts,
2137
+ block_size,
2138
+ max_loras,
2139
+ max_num_tokens_padded,
2140
+ max_num_m_blocks,
2141
+ sorted_token_ids,
2142
+ experts_ids,
2143
+ num_tokens_post_pad,
2144
+ adapter_enabled,
2145
+ lora_ids,
2146
+ expert_map,
2147
+ )
2148
+
2149
+
2150
+ def moe_wna16_gemm(
2151
+ input: torch.Tensor,
2152
+ output: torch.Tensor,
2153
+ b_qweight: torch.Tensor,
2154
+ b_scales: torch.Tensor,
2155
+ b_qzeros: torch.Tensor | None,
2156
+ topk_weights: torch.Tensor | None,
2157
+ sorted_token_ids: torch.Tensor,
2158
+ experts_ids: torch.Tensor,
2159
+ num_tokens_post_pad: torch.Tensor,
2160
+ top_k: int,
2161
+ BLOCK_SIZE_M: int,
2162
+ BLOCK_SIZE_N: int,
2163
+ BLOCK_SIZE_K: int,
2164
+ bit: int,
2165
+ ) -> torch.Tensor:
2166
+ if not current_platform.is_cuda():
2167
+ raise NotImplementedError(
2168
+ "The optimized moe_wna16_gemm kernel is only available on CUDA platforms"
2169
+ )
2170
+ torch.ops._moe_C.moe_wna16_gemm(
2171
+ input,
2172
+ output,
2173
+ b_qweight,
2174
+ b_scales,
2175
+ b_qzeros,
2176
+ topk_weights,
2177
+ sorted_token_ids,
2178
+ experts_ids,
2179
+ num_tokens_post_pad,
2180
+ top_k,
2181
+ BLOCK_SIZE_M,
2182
+ BLOCK_SIZE_N,
2183
+ BLOCK_SIZE_K,
2184
+ bit,
2185
+ )
2186
+
2187
+
2188
+ def topk_softmax(
2189
+ topk_weights: torch.Tensor,
2190
+ topk_ids: torch.Tensor,
2191
+ token_expert_indices: torch.Tensor,
2192
+ gating_output: torch.Tensor,
2193
+ renormalize: bool = False,
2194
+ ) -> None:
2195
+ torch.ops._moe_C.topk_softmax(
2196
+ topk_weights, topk_ids, token_expert_indices, gating_output, renormalize
2197
+ )
2198
+
2199
+
2200
+ def grouped_topk(
2201
+ scores: torch.Tensor,
2202
+ num_expert_group: int,
2203
+ topk_group: int,
2204
+ topk: int,
2205
+ renormalize: bool,
2206
+ routed_scaling_factor: float,
2207
+ bias: torch.Tensor,
2208
+ scoring_func: int = 0,
2209
+ ):
2210
+ """
2211
+ Perform grouped top-k routing for mixture of experts.
2212
+
2213
+ Args:
2214
+ scores: Raw inputs (logits if scoring_func=1, scores if scoring_func=0)
2215
+ num_expert_group: Number of expert groups
2216
+ topk_group: Number of groups to select
2217
+ topk: Number of experts to select per token
2218
+ renormalize: Whether to renormalize the output weights
2219
+ routed_scaling_factor: Scaling factor for routing weights
2220
+ bias: Bias tensor (e_score_correction_bias). Always fused in kernel.
2221
+ scoring_func: 0=none (no activation), 1=sigmoid
2222
+ """
2223
+ if not current_platform.is_cuda():
2224
+ raise NotImplementedError(
2225
+ "The fused grouped_topk kernel is only available on CUDA platforms"
2226
+ )
2227
+ return torch.ops._moe_C.grouped_topk(
2228
+ scores,
2229
+ num_expert_group,
2230
+ topk_group,
2231
+ topk,
2232
+ renormalize,
2233
+ routed_scaling_factor,
2234
+ bias,
2235
+ scoring_func,
2236
+ )
2237
+
2238
+
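+ # A minimal call sketch for the fused grouped top-k routing described above,
+ # using illustrative sizes modeled on a DeepSeek-style router (256 experts in
+ # 8 groups, 4 groups kept, 8 experts per token, sigmoid scoring); the sizes,
+ # scaling factor, and zero bias are assumptions, not values from this file.
+ def _grouped_topk_example():
+     logits = torch.randn(16, 256, device="cuda", dtype=torch.float32)
+     e_score_correction_bias = torch.zeros(256, device="cuda", dtype=torch.float32)
+     return grouped_topk(
+         logits,
+         num_expert_group=8,
+         topk_group=4,
+         topk=8,
+         renormalize=True,
+         routed_scaling_factor=2.5,
+         bias=e_score_correction_bias,
+         scoring_func=1,  # sigmoid, per the docstring above
+     )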
2239
+ def moe_wna16_marlin_gemm(
2240
+ input: torch.Tensor,
2241
+ output: torch.Tensor | None,
2242
+ b_qweight: torch.Tensor,
2243
+ b_bias: torch.Tensor | None,
2244
+ b_scales: torch.Tensor,
2245
+ a_scales: torch.Tensor | None,
2246
+ global_scale: torch.Tensor | None,
2247
+ b_qzeros: torch.Tensor | None,
2248
+ g_idx: torch.Tensor | None,
2249
+ perm: torch.Tensor | None,
2250
+ workspace: torch.Tensor,
2251
+ sorted_token_ids: torch.Tensor,
2252
+ expert_ids: torch.Tensor,
2253
+ num_tokens_past_padded: torch.Tensor,
2254
+ topk_weights: torch.Tensor,
2255
+ moe_block_size: int,
2256
+ top_k: int,
2257
+ mul_topk_weights: bool,
2258
+ b_q_type: ScalarType,
2259
+ size_m: int,
2260
+ size_n: int,
2261
+ size_k: int,
2262
+ is_k_full: bool,
2263
+ use_atomic_add: bool,
2264
+ use_fp32_reduce: bool,
2265
+ is_zp_float: bool,
2266
+ thread_k: int = -1,
2267
+ thread_n: int = -1,
2268
+ blocks_per_sm: int = -1,
2269
+ ) -> torch.Tensor:
2270
+ return torch.ops._moe_C.moe_wna16_marlin_gemm(
2271
+ input,
2272
+ output,
2273
+ b_qweight,
2274
+ b_bias,
2275
+ b_scales,
2276
+ a_scales,
2277
+ global_scale,
2278
+ b_qzeros,
2279
+ g_idx,
2280
+ perm,
2281
+ workspace,
2282
+ sorted_token_ids,
2283
+ expert_ids,
2284
+ num_tokens_past_padded,
2285
+ topk_weights,
2286
+ moe_block_size,
2287
+ top_k,
2288
+ mul_topk_weights,
2289
+ b_q_type.id,
2290
+ size_m,
2291
+ size_n,
2292
+ size_k,
2293
+ is_k_full,
2294
+ use_atomic_add,
2295
+ use_fp32_reduce,
2296
+ is_zp_float,
2297
+ thread_k,
2298
+ thread_n,
2299
+ blocks_per_sm,
2300
+ )
2301
+
2302
+
2303
+ if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "marlin_gemm_moe"):
2304
+
2305
+ @register_fake("_moe_C::marlin_gemm_moe")
2306
+ def marlin_gemm_moe_fake(
2307
+ a: torch.Tensor,
2308
+ b_q_weights: torch.Tensor,
2309
+ sorted_ids: torch.Tensor,
2310
+ topk_weights: torch.Tensor,
2311
+ topk_ids: torch.Tensor,
2312
+ b_scales: torch.Tensor,
2313
+ b_zero_points: torch.Tensor,
2314
+ g_idx: torch.Tensor,
2315
+ perm: torch.Tensor,
2316
+ workspace: torch.Tensor,
2317
+ b_q_type: ScalarType,
2318
+ size_m: torch.SymInt,
2319
+ size_n: torch.SymInt,
2320
+ size_k: torch.SymInt,
2321
+ is_k_full: bool,
2322
+ num_experts: int,
2323
+ topk: int,
2324
+ moe_block_size: int,
2325
+ replicate_input: bool,
2326
+ apply_weights: bool,
2327
+ ) -> torch.Tensor:
2328
+ return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
2329
+
2330
+ @register_fake("_moe_C::moe_wna16_marlin_gemm")
2331
+ def moe_wna16_marlin_gemm_fake(
2332
+ input: torch.Tensor,
2333
+ output: torch.Tensor | None,
2334
+ b_qweight: torch.Tensor,
2335
+ b_bias: torch.Tensor | None,
2336
+ b_scales: torch.Tensor,
2337
+ a_scales: torch.Tensor | None,
2338
+ global_scale: torch.Tensor | None,
2339
+ b_qzeros: torch.Tensor | None,
2340
+ g_idx: torch.Tensor | None,
2341
+ perm: torch.Tensor | None,
2342
+ workspace: torch.Tensor,
2343
+ sorted_token_ids: torch.Tensor,
2344
+ expert_ids: torch.Tensor,
2345
+ num_tokens_past_padded: torch.Tensor,
2346
+ topk_weights: torch.Tensor,
2347
+ moe_block_size: int,
2348
+ top_k: int,
2349
+ mul_topk_weights: bool,
2350
+ b_q_type: ScalarType,
2351
+ size_m: int,
2352
+ size_n: int,
2353
+ size_k: int,
2354
+ is_k_full: bool,
2355
+ use_atomic_add: bool,
2356
+ use_fp32_reduce: bool,
2357
+ is_zp_float: bool,
2358
+ ):
2359
+ return torch.empty(
2360
+ (size_m * top_k, size_n), dtype=input.dtype, device=input.device
2361
+ )
2362
+
2363
+
2364
+ def reshape_and_cache(
2365
+ key: torch.Tensor,
2366
+ value: torch.Tensor,
2367
+ key_cache: torch.Tensor,
2368
+ value_cache: torch.Tensor,
2369
+ slot_mapping: torch.Tensor,
2370
+ kv_cache_dtype: str,
2371
+ k_scale: torch.Tensor,
2372
+ v_scale: torch.Tensor,
2373
+ ) -> None:
2374
+ torch.ops._C_cache_ops.reshape_and_cache(
2375
+ key,
2376
+ value,
2377
+ key_cache,
2378
+ value_cache,
2379
+ slot_mapping,
2380
+ kv_cache_dtype,
2381
+ k_scale,
2382
+ v_scale,
2383
+ )
2384
+
2385
+
2386
+ def reshape_and_cache_flash(
2387
+ key: torch.Tensor,
2388
+ value: torch.Tensor,
2389
+ key_cache: torch.Tensor,
2390
+ value_cache: torch.Tensor,
2391
+ slot_mapping: torch.Tensor,
2392
+ kv_cache_dtype: str,
2393
+ k_scale: torch.Tensor,
2394
+ v_scale: torch.Tensor,
2395
+ ) -> None:
2396
+ torch.ops._C_cache_ops.reshape_and_cache_flash(
2397
+ key,
2398
+ value,
2399
+ key_cache,
2400
+ value_cache,
2401
+ slot_mapping,
2402
+ kv_cache_dtype,
2403
+ k_scale,
2404
+ v_scale,
2405
+ )
2406
+
2407
+
2408
+ def concat_and_cache_mla(
2409
+ kv_c: torch.Tensor,
2410
+ k_pe: torch.Tensor,
2411
+ kv_cache: torch.Tensor,
2412
+ slot_mapping: torch.Tensor,
2413
+ kv_cache_dtype: str,
2414
+ scale: torch.Tensor,
2415
+ ) -> None:
2416
+ torch.ops._C_cache_ops.concat_and_cache_mla(
2417
+ kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale
2418
+ )
2419
+
2420
+
2421
+ def concat_and_cache_mla_rope_fused(
2422
+ positions: torch.Tensor,
2423
+ q_pe: torch.Tensor,
2424
+ k_pe: torch.Tensor,
2425
+ kv_c: torch.Tensor,
2426
+ cos_sin_cache: torch.Tensor,
2427
+ is_neox: bool,
2428
+ slot_mapping: torch.Tensor,
2429
+ kv_cache: torch.Tensor,
2430
+ kv_cache_dtype: str,
2431
+ kv_cache_scale: torch.Tensor,
2432
+ ) -> None:
2433
+ torch.ops._C_cache_ops.concat_and_cache_mla_rope_fused(
2434
+ positions,
2435
+ q_pe,
2436
+ k_pe,
2437
+ kv_c,
2438
+ cos_sin_cache,
2439
+ is_neox,
2440
+ slot_mapping,
2441
+ kv_cache,
2442
+ kv_cache_dtype,
2443
+ kv_cache_scale,
2444
+ )
2445
+
2446
+
2447
+ def swap_blocks(
2448
+ src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
2449
+ ) -> None:
2450
+ torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping)
2451
+
2452
+
2453
+ def convert_fp8(
2454
+ output: torch.Tensor, input: torch.Tensor, scale: float = 1.0, kv_dtype: str = "fp8"
2455
+ ) -> None:
2456
+ torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype)
2457
+
2458
+
2459
+ def gather_and_maybe_dequant_cache(
2460
+ src_cache: torch.Tensor,
2461
+ dst: torch.Tensor,
2462
+ block_table: torch.Tensor,
2463
+ cu_seq_lens: torch.Tensor,
2464
+ token_to_seq: torch.Tensor,
2465
+ num_tokens: int,
2466
+ kv_cache_dtype: str,
2467
+ scale: torch.Tensor,
2468
+ seq_starts: torch.Tensor | None = None,
2469
+ ) -> None:
2470
+ torch.ops._C_cache_ops.gather_and_maybe_dequant_cache(
2471
+ src_cache,
2472
+ dst,
2473
+ block_table,
2474
+ cu_seq_lens,
2475
+ token_to_seq,
2476
+ num_tokens,
2477
+ kv_cache_dtype,
2478
+ scale,
2479
+ seq_starts,
2480
+ )
2481
+
2482
+
2483
+ def cp_gather_cache(
2484
+ src_cache: torch.Tensor,
2485
+ dst: torch.Tensor,
2486
+ block_table: torch.Tensor,
2487
+ cu_seq_lens: torch.Tensor,
2488
+ batch_size: int,
2489
+ seq_starts: torch.Tensor | None = None,
2490
+ ) -> None:
2491
+ torch.ops._C_cache_ops.cp_gather_cache(
2492
+ src_cache, dst, block_table, cu_seq_lens, batch_size, seq_starts
2493
+ )
2494
+
2495
+
2496
+ def cp_gather_and_upconvert_fp8_kv_cache(
2497
+ src_cache: torch.Tensor,
2498
+ dst: torch.Tensor,
2499
+ block_table: torch.Tensor,
2500
+ seq_lens: torch.Tensor,
2501
+ workspace_starts: torch.Tensor,
2502
+ batch_size: int,
2503
+ ) -> None:
2504
+ """Gather and upconvert FP8 KV cache to BF16 workspace.
2505
+
2506
+ Args:
2507
+ src_cache: FP8 KV cache [num_blocks, block_size, 656]
2508
+ dst: BF16 output workspace [total_tokens, 576]
2509
+ block_table: Block indices [num_reqs, max_blocks]
2510
+ seq_lens: Sequence lengths [num_reqs]
2511
+ workspace_starts: Workspace start offsets [num_reqs]
2512
+ batch_size: Number of requests
2513
+ """
2514
+ torch.ops._C_cache_ops.cp_gather_and_upconvert_fp8_kv_cache(
2515
+ src_cache, dst, block_table, seq_lens, workspace_starts, batch_size
2516
+ )
2517
+
2518
+
2519
+ def indexer_k_quant_and_cache(
2520
+ k: torch.Tensor,
2521
+ kv_cache: torch.Tensor,
2522
+ slot_mapping: torch.Tensor,
2523
+ quant_block_size: int,
2524
+ kv_cache_dtype: str,
2525
+ ) -> None:
2526
+ torch.ops._C_cache_ops.indexer_k_quant_and_cache(
2527
+ k, kv_cache, slot_mapping, quant_block_size, kv_cache_dtype
2528
+ )
2529
+
2530
+
2531
+ def cp_gather_indexer_k_quant_cache(
2532
+ kv_cache: torch.Tensor,
2533
+ dst_k: torch.Tensor,
2534
+ dst_scale: torch.Tensor,
2535
+ block_table: torch.Tensor,
2536
+ cu_seq_lens: torch.Tensor,
2537
+ ) -> None:
2538
+ torch.ops._C_cache_ops.cp_gather_indexer_k_quant_cache(
2539
+ kv_cache, dst_k, dst_scale, block_table, cu_seq_lens
2540
+ )
2541
+
2542
+
2543
+ def get_device_attribute(attribute: int, device: int) -> int:
2544
+ return torch.ops._C_cuda_utils.get_device_attribute(attribute, device)
2545
+
2546
+
2547
+ def get_max_shared_memory_per_block_device_attribute(device: int) -> int:
2548
+ # ruff: noqa: E501
2549
+ return torch.ops._C_cuda_utils.get_max_shared_memory_per_block_device_attribute(
2550
+ device
2551
+ )
2552
+
2553
+
2554
+ # custom ar
2555
+ def init_custom_ar(
2556
+ ipc_tensors: list[torch.Tensor],
2557
+ rank_data: torch.Tensor,
2558
+ rank: int,
2559
+ fully_connected: bool,
2560
+ ) -> int:
2561
+ return torch.ops._C_custom_ar.init_custom_ar(
2562
+ ipc_tensors, rank_data, rank, fully_connected
2563
+ )
2564
+
2565
+
2566
+ def all_reduce(
2567
+ fa: int,
2568
+ inp: torch.Tensor,
2569
+ out: torch.Tensor,
2570
+ reg_buffer: int,
2571
+ reg_buffer_sz_bytes: int,
2572
+ ) -> None:
2573
+ torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
2574
+
2575
+
2576
+ def dispose(fa: int) -> None:
2577
+ torch.ops._C_custom_ar.dispose(fa)
2578
+
2579
+
2580
+ def meta_size() -> int:
2581
+ return torch.ops._C_custom_ar.meta_size()
2582
+
2583
+
2584
+ def register_buffer(fa: int, ipc_tensors: list[int]) -> None:
2585
+ return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors)
2586
+
2587
+
2588
+ def get_graph_buffer_ipc_meta(fa: int) -> tuple[list[int], list[int]]:
2589
+ return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
2590
+
2591
+
2592
+ def register_graph_buffers(
2593
+ fa: int, handles: list[list[int]], offsets: list[list[int]]
2594
+ ) -> None:
2595
+ torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
2596
+
2597
+
2598
+ def allocate_shared_buffer_and_handle(size: int) -> tuple[int, torch.Tensor]:
2599
+ return torch.ops._C_custom_ar.allocate_shared_buffer_and_handle(size)
2600
+
2601
+
2602
+ def open_mem_handle(mem_handle: torch.Tensor):
2603
+ return torch.ops._C_custom_ar.open_mem_handle(mem_handle)
2604
+
2605
+
2606
+ def free_shared_buffer(ptr: int) -> None:
2607
+ torch.ops._C_custom_ar.free_shared_buffer(ptr)
2608
+
2609
+
2610
+ # quick all reduce
2611
+ def init_custom_qr(rank: int, world_size: int, qr_max_size: int | None = None) -> int:
2612
+ return torch.ops._C_custom_ar.init_custom_qr(rank, world_size, qr_max_size)
2613
+
2614
+
2615
+ def qr_destroy(fa: int) -> None:
2616
+ torch.ops._C_custom_ar.qr_destroy(fa)
2617
+
2618
+
2619
+ def qr_all_reduce(
2620
+ fa: int,
2621
+ inp: torch.Tensor,
2622
+ out: torch.Tensor,
2623
+ quant_level: int,
2624
+ cast_bf2half: bool = False,
2625
+ ) -> None:
2626
+ torch.ops._C_custom_ar.qr_all_reduce(fa, inp, out, quant_level, cast_bf2half)
2627
+
2628
+
2629
+ def qr_get_handle(fa: int) -> torch.Tensor:
2630
+ return torch.ops._C_custom_ar.qr_get_handle(fa)
2631
+
2632
+
2633
+ def qr_open_handles(fa: int, handles: list[torch.Tensor]) -> None:
2634
+ return torch.ops._C_custom_ar.qr_open_handles(fa, handles)
2635
+
2636
+
2637
+ def qr_max_size() -> int:
2638
+ return torch.ops._C_custom_ar.qr_max_size()
2639
+
2640
+
2641
+ def get_flash_mla_metadata(
2642
+ cache_seqlens: torch.Tensor,
2643
+ num_heads_per_head_k: int,
2644
+ num_heads_k: int,
2645
+ ) -> tuple[torch.Tensor, torch.Tensor]:
2646
+ """
2647
+ Arguments:
2648
+ cache_seqlens: (batch_size), dtype torch.int32.
2649
+ num_heads_per_head_k: Equals to seq_len_q * num_heads_q // num_heads_k.
2650
+ num_heads_k: num_heads_k.
2651
+
2652
+ Return:
2653
+ tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), dtype torch.int32.
2654
+ num_splits: (batch_size + 1), dtype torch.int32.
2655
+ """
2656
+ return torch.ops._C.get_flash_mla_metadata(
2657
+ cache_seqlens, num_heads_per_head_k, num_heads_k
2658
+ )
2659
+
2660
+
2661
+ def flash_mla_with_kvcache(
2662
+ q: torch.Tensor,
2663
+ k_cache: torch.Tensor,
2664
+ block_table: torch.Tensor,
2665
+ cache_seqlens: torch.Tensor,
2666
+ head_dim_v: int,
2667
+ tile_scheduler_metadata: torch.Tensor,
2668
+ num_splits: torch.Tensor,
2669
+ softmax_scale: float | None = None,
2670
+ causal: bool = False,
2671
+ ) -> tuple[torch.Tensor, torch.Tensor]:
2672
+ """
2673
+ Arguments:
2674
+ q: (batch_size, seq_len_q, num_heads_q, head_dim).
2675
+ k_cache: (num_blocks, page_block_size, num_heads_k, head_dim).
2676
+ block_table: (batch_size, max_num_blocks_per_seq), torch.int32.
2677
+ cache_seqlens: (batch_size), torch.int32.
2678
+ head_dim_v: Head_dim of v.
2679
+ tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), torch.int32, return by get_mla_metadata.
2680
+ num_splits: (batch_size + 1), torch.int32, return by get_mla_metadata.
2681
+ softmax_scale: float. The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim).
2682
+ causal: bool. Whether to apply causal attention mask.
2683
+
2684
+ Return:
2685
+ out: (batch_size, seq_len_q, num_heads_q, head_dim_v).
2686
+ softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32.
2687
+ """
2688
+ if softmax_scale is None:
2689
+ softmax_scale = q.shape[-1] ** (-0.5)
2690
+ out, softmax_lse = torch.ops._C.flash_mla_fwd_kvcache(
2691
+ q,
2692
+ k_cache,
2693
+ None,
2694
+ head_dim_v,
2695
+ cache_seqlens,
2696
+ block_table,
2697
+ softmax_scale,
2698
+ causal,
2699
+ tile_scheduler_metadata,
2700
+ num_splits,
2701
+ )
2702
+ return out, softmax_lse
2703
+
2704
+
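+ # A call-flow sketch tying get_flash_mla_metadata and flash_mla_with_kvcache
+ # together for single-token decode. The MLA head sizes (576 = 512 latent + 64
+ # rope, head_dim_v = 512), the page size of 64, and num_heads_k == 1 are
+ # illustrative assumptions about the caller, not requirements stated here.
+ def _flash_mla_decode_example(
+     q: torch.Tensor,              # (batch, 1, num_heads_q, 576)
+     k_cache: torch.Tensor,        # (num_blocks, 64, 1, 576)
+     block_table: torch.Tensor,    # (batch, max_blocks_per_seq), int32
+     cache_seqlens: torch.Tensor,  # (batch,), int32
+ ) -> torch.Tensor:
+     seq_len_q, num_heads_q = q.shape[1], q.shape[2]
+     # num_heads_per_head_k = seq_len_q * num_heads_q // num_heads_k, with num_heads_k == 1.
+     metadata, num_splits = get_flash_mla_metadata(cache_seqlens, seq_len_q * num_heads_q, 1)
+     out, _lse = flash_mla_with_kvcache(
+         q, k_cache, block_table, cache_seqlens, 512, metadata, num_splits, causal=True
+     )
+     return out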
2705
+ def sm100_cutlass_mla_decode(
2706
+ out: torch.Tensor,
2707
+ lse: torch.Tensor,
2708
+ q_nope: torch.Tensor,
2709
+ q_pe: torch.Tensor,
2710
+ kv_c_and_k_pe_cache: torch.Tensor,
2711
+ seq_lens: torch.Tensor,
2712
+ page_table: torch.Tensor,
2713
+ workspace: torch.Tensor,
2714
+ scale: float,
2715
+ num_kv_splits: int,
2716
+ ) -> torch.Tensor:
2717
+ torch.ops._C.sm100_cutlass_mla_decode(
2718
+ out,
2719
+ lse,
2720
+ q_nope,
2721
+ q_pe,
2722
+ kv_c_and_k_pe_cache,
2723
+ seq_lens,
2724
+ page_table,
2725
+ workspace,
2726
+ scale,
2727
+ num_kv_splits,
2728
+ )
2729
+ return out
2730
+
2731
+
2732
+ def sm100_cutlass_mla_get_workspace_size(
2733
+ max_seq_len: int, num_batches: int, sm_count: int, num_kv_splits: int
2734
+ ) -> int:
2735
+ return torch.ops._C.sm100_cutlass_mla_get_workspace_size(
2736
+ max_seq_len, num_batches, sm_count, num_kv_splits
2737
+ )
2738
+
2739
+
2740
+ if hasattr(torch.ops._C, "weight_packed_linear"):
2741
+
2742
+ @register_fake("_C::weight_packed_linear")
2743
+ def weight_packed_linear_fake(
2744
+ mat1: torch.Tensor,
2745
+ mat2: torch.Tensor,
2746
+ bias: torch.Tensor | None,
2747
+ is_vnni: bool,
2748
+ ) -> torch.Tensor:
2749
+ return torch.empty(
2750
+ (mat1.size(0), mat2.size(0)), dtype=mat1.dtype, device=mat2.device
2751
+ )
2752
+
2753
+
2754
+ if hasattr(torch.ops._C, "fused_experts_cpu"):
2755
+
2756
+ @register_fake("_C::fused_experts_cpu")
2757
+ def fused_experts_cpu_fake(
2758
+ hidden_states: torch.Tensor,
2759
+ w1: torch.Tensor,
2760
+ w2: torch.Tensor,
2761
+ topk_weights: torch.Tensor,
2762
+ topk_ids: torch.Tensor,
2763
+ inplace: bool,
2764
+ use_int8_w8a8: bool,
2765
+ use_fp8_w8a16: bool,
2766
+ w1_scale: torch.Tensor | None,
2767
+ w2_scale: torch.Tensor | None,
2768
+ block_size: list[int] | None,
2769
+ a1_scale: torch.Tensor | None,
2770
+ a2_scale: torch.Tensor | None,
2771
+ is_vnni: bool,
2772
+ ) -> torch.Tensor:
2773
+ return torch.empty_like(hidden_states)
2774
+
2775
+
2776
+ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
2777
+
2778
+ @register_fake("_C::int8_scaled_mm_with_quant")
2779
+ def int8_scaled_mm_with_quant_fake(
2780
+ mat1: torch.Tensor,
2781
+ mat2: torch.Tensor,
2782
+ scales2: torch.Tensor,
2783
+ bias: torch.Tensor | None,
2784
+ out_dtype: torch.dtype,
2785
+ is_vnni: bool,
2786
+ ) -> torch.Tensor:
2787
+ M = mat1.size(0)
2788
+ N = mat2.size(0)
2789
+ return torch.empty((M, N), dtype=out_dtype)
2790
+
2791
+
2792
+ class CPUDNNLGEMMHandler:
2793
+ def __init__(self) -> None:
2794
+ self.handler: int | None = None
2795
+ self.n = -1
2796
+ self.k = -1
2797
+
2798
+ def __del__(self):
2799
+ if self.handler is not None:
2800
+ torch.ops._C.release_dnnl_matmul_handler(self.handler)
2801
+
2802
+
2803
+ _supports_onednn = bool(hasattr(torch.ops._C, "create_onednn_mm_handler"))
2804
+
2805
+
2806
+ def is_onednn_acl_supported():
2807
+ return torch.ops._C.is_onednn_acl_supported()
2808
+
2809
+
2810
+ def create_onednn_mm(
2811
+ weight: torch.Tensor, # [K, N]
2812
+ primitive_cache_size: int = 128,
2813
+ ) -> CPUDNNLGEMMHandler:
2814
+ handler = CPUDNNLGEMMHandler()
2815
+ handler.k, handler.n = weight.size()
2816
+ handler.handler = torch.ops._C.create_onednn_mm_handler(
2817
+ weight, primitive_cache_size
2818
+ )
2819
+ return handler
2820
+
2821
+
2822
+ def onednn_mm(
2823
+ dnnl_handler: CPUDNNLGEMMHandler,
2824
+ x: torch.Tensor,
2825
+ bias: torch.Tensor | None,
2826
+ ) -> torch.Tensor:
2827
+ output = torch.empty((*x.shape[0:-1], dnnl_handler.n), dtype=x.dtype)
2828
+ torch.ops._C.onednn_mm(
2829
+ output, x.reshape(-1, dnnl_handler.k), bias, dnnl_handler.handler
2830
+ )
2831
+
2832
+ return output
2833
+
2834
+
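+ # A minimal usage sketch for the oneDNN CPU GEMM handler pattern above: create
+ # the handler once per weight and reuse it for every forward pass (illustrative
+ # shapes; real callers first check the _supports_onednn probe defined earlier).
+ def _onednn_mm_example() -> torch.Tensor:
+     weight = torch.randn(1024, 4096, dtype=torch.bfloat16)  # [K, N]
+     handler = create_onednn_mm(weight)
+     x = torch.randn(8, 1024, dtype=torch.bfloat16)  # [M, K]
+     return onednn_mm(handler, x, bias=None)  # [M, N] = [8, 4096]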
2835
+ def create_onednn_scaled_mm(
+     weight: torch.Tensor,  # [K, N]
+     weight_scales: torch.Tensor,
+     output_type: torch.dtype,
+     dynamic_quant: bool,
+     use_azp: bool,
+     primitive_cache_size: int = 128,
+ ) -> CPUDNNLGEMMHandler:
+     handler = CPUDNNLGEMMHandler()
+     handler.k, handler.n = weight.size()
+     handler.handler = torch.ops._C.create_onednn_scaled_mm_handler(
+         weight, weight_scales, output_type, dynamic_quant, use_azp, primitive_cache_size
+     )
+     return handler
+
+
+ def onednn_scaled_int8_quant(
+     input: torch.Tensor,
+     scale: torch.Tensor | None = None,
+     azp: torch.Tensor | None = None,
+     symmetric: bool = True,
+ ):
+     """
+     Quantize the input tensor to int8, returning the quantized tensor, the scales,
+     and, for asymmetric quantization, the azp.
+
+     Args:
+         input: The input tensor to be quantized to int8.
+         scale: Optional scaling factor for the int8 quantization.
+             When not provided, dynamic-per-token quantization is used.
+         azp: Optional zero-point for the int8 quantization.
+             Must be provided for asymmetric quantization if `scale` is provided.
+         symmetric: Whether to use symmetric quantization (scale only, azp ignored).
+
+     Returns:
+         tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: Output int8 tensor,
+         scales, and optionally azp.
+     """
+     output = torch.empty_like(input, dtype=torch.int8)
+     token_num = input.numel() // input.shape[-1]
+     input = input.view((token_num, input.shape[-1]))
+     if scale is not None:
+         # static-per-tensor quantization.
+         assert symmetric == (azp is None), (
+             "azp must only be provided for asymmetric quantization."
+         )
+         torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
+         return output, scale, azp
+
+     # dynamic-per-token quantization.
+     input_scales = torch.empty((token_num, 1), device=input.device, dtype=torch.float32)
+     input_azp = None if symmetric else torch.empty_like(input_scales, dtype=torch.int32)
+     torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales, input_azp)
+     return output, input_scales, input_azp
+
+
+ def onednn_scaled_mm(
+     dnnl_handler: CPUDNNLGEMMHandler,
+     x: torch.Tensor,
+     output: torch.Tensor,
+     input_scale: torch.Tensor | None,
+     input_zp: torch.Tensor | None,
+     input_zp_adj: torch.Tensor | None,
+     bias: torch.Tensor | None,
+ ) -> torch.Tensor:
+     torch.ops._C.onednn_scaled_mm(
+         output, x, input_scale, input_zp, input_zp_adj, bias, dnnl_handler.handler
+     )
+
+     return output
+
+
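Taken together, the three functions above cover the W8A8 path: prepack an int8 weight with its scales, quantize the activations on the fly, then run the scaled matmul into a preallocated output. The sketch below shows that flow under stated assumptions (importable as vllm._custom_ops, per-output-channel weight scales, bf16 activations); it is illustrative, not a guaranteed configuration of this wheel.

    import torch

    from vllm import _custom_ops as ops  # assumed import path

    K, N, M = 256, 512, 4
    weight_q = torch.randint(-128, 127, (K, N), dtype=torch.int8)
    weight_scales = torch.rand(N, dtype=torch.float32)  # assumed per-output-channel

    handler = ops.create_onednn_scaled_mm(
        weight_q, weight_scales, torch.bfloat16, dynamic_quant=True, use_azp=False
    )

    x = torch.randn(M, K, dtype=torch.bfloat16)
    x_q, x_scales, _ = ops.onednn_scaled_int8_quant(x)  # no scale -> dynamic per-token
    out = torch.empty(M, N, dtype=torch.bfloat16)
    ops.onednn_scaled_mm(handler, x_q, out, x_scales, None, None, bias=None)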
+ def cpu_attn_get_scheduler_metadata(
+     num_reqs: int,
+     num_heads: int,
+     num_kv_heads: int,
+     head_dim: int,
+     seq_lens: torch.Tensor,
+     dtype: torch.dtype,
+     query_start_loc: torch.Tensor,
+     causal: bool,
+     sliding_window_size: int,
+     isa: str,
+     enable_kv_split: bool,
+ ) -> torch.Tensor:
+     scheduler_metadata = torch.ops._C.get_scheduler_metadata(
+         num_reqs,
+         num_heads,
+         num_kv_heads,
+         head_dim,
+         seq_lens,
+         dtype,
+         query_start_loc,
+         causal,
+         sliding_window_size,
+         isa,
+         enable_kv_split,
+     )
+     return scheduler_metadata
+
+
+ def cpu_attn_reshape_and_cache(
+     key: torch.Tensor,
+     value: torch.Tensor,
+     key_cache: torch.Tensor,
+     value_cache: torch.Tensor,
+     slot_mapping: torch.Tensor,
+     isa: str,
+ ) -> None:
+     torch.ops._C.cpu_attn_reshape_and_cache(
+         key,
+         value,
+         key_cache,
+         value_cache,
+         slot_mapping,
+         isa,
+     )
+
+
+ def cpu_attention_with_kv_cache(
+     query: torch.Tensor,
+     key_cache: torch.Tensor,
+     value_cache: torch.Tensor,
+     output: torch.Tensor,
+     query_start_loc: torch.Tensor,
+     seq_lens: torch.Tensor,
+     scale: float,
+     causal: bool,
+     alibi_slopes: torch.Tensor | None,
+     sliding_window: tuple[int, int],
+     block_table: torch.Tensor,
+     softcap: float,
+     scheduler_metadata: torch.Tensor,
+     s_aux: torch.Tensor | None,
+ ) -> None:
+     torch.ops._C.cpu_attention_with_kv_cache(
+         query,
+         key_cache,
+         value_cache,
+         output,
+         query_start_loc,
+         seq_lens,
+         scale,
+         causal,
+         alibi_slopes,
+         sliding_window[0],
+         sliding_window[1],
+         block_table,
+         softcap,
+         scheduler_metadata,
+         s_aux,
+     )
+
+
+ def cpu_gemm_wna16(
+     input: torch.Tensor,
+     q_weight: torch.Tensor,
+     scales: torch.Tensor,
+     zeros: torch.Tensor | None,
+     g_idx: torch.Tensor | None,
+     bias: torch.Tensor | None,
+     pack_factor: int,
+     isa_hint: str,
+ ) -> torch.Tensor:
+     output = torch.empty((input.size(0), scales.size(1)), dtype=input.dtype)
+     torch.ops._C.cpu_gemm_wna16(
+         input,
+         q_weight,
+         output,
+         scales,
+         zeros,
+         g_idx,
+         bias,
+         pack_factor,
+         isa_hint,
+     )
+     return output
+
+
+ def cpu_prepack_moe_weight(
+     weight: torch.Tensor,
+     isa: str,
+ ) -> torch.Tensor:
+     output = torch.empty_like(weight)
+     torch.ops._C.prepack_moe_weight(weight, output, isa)
+     return output
+
+
+ def cpu_fused_moe(
+     input: torch.Tensor,
+     w13: torch.Tensor,
+     w2: torch.Tensor,
+     w13_bias: torch.Tensor | None,
+     w2_bias: torch.Tensor | None,
+     topk_weights: torch.Tensor,
+     topk_ids: torch.Tensor,
+     act: str,
+     isa: str,
+ ) -> torch.Tensor:
+     output = torch.empty_like(input)
+     torch.ops._C.cpu_fused_moe(
+         output,
+         input,
+         w13,
+         w2,
+         w13_bias,
+         w2_bias,
+         topk_weights,
+         topk_ids,
+         act,
+         isa,
+     )
+     return output
+
+
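For readers unfamiliar with the fused-MoE wrapper above, the naive per-token, per-expert loop below captures the intended routing semantics in plain PyTorch. The [num_experts, 2*intermediate, hidden] gate/up layout for w13, the [num_experts, hidden, intermediate] layout for w2, and the SiLU activation are assumptions chosen for illustration, not guaranteed by this wheel; biases are omitted.

    import torch
    import torch.nn.functional as F


    def naive_moe(x, w13, w2, topk_weights, topk_ids):
        # x: [T, H], w13: [E, 2*I, H], w2: [E, H, I] (assumed layouts)
        inter = w13.shape[1] // 2
        out = torch.zeros_like(x)
        for t in range(x.shape[0]):
            for w, e in zip(topk_weights[t], topk_ids[t].tolist()):
                gate_up = x[t] @ w13[e].T              # [2*I]
                act = F.silu(gate_up[:inter]) * gate_up[inter:]
                out[t] += w * (act @ w2[e].T)          # accumulate routed expert output
        return out


    x = torch.randn(2, 8)
    w13, w2 = torch.randn(4, 12, 8), torch.randn(4, 8, 6)
    topk_w = torch.tensor([[0.6, 0.4], [1.0, 0.0]])
    topk_i = torch.tensor([[0, 3], [1, 2]])
    ref = naive_moe(x, w13, w2, topk_w, topk_i)        # [2, 8]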
+ if hasattr(torch.ops._qutlass_C, "matmul_mxf4_bf16_tn"):
+
+     @register_fake("_qutlass_C::matmul_mxf4_bf16_tn")
+     def _fake_matmul_mxf4_bf16_tn(
+         a: torch.Tensor,
+         b: torch.Tensor,
+         a_sf: torch.Tensor,
+         b_sf: torch.Tensor,
+         alpha: torch.Tensor,
+     ):
+         return a.new_empty(*a.shape[:-1], b.shape[0], dtype=torch.bfloat16)
+
+
+ def matmul_mxf4_bf16_tn(
+     a: torch.Tensor,
+     b: torch.Tensor,
+     a_sf: torch.Tensor,
+     b_sf: torch.Tensor,
+     alpha: torch.Tensor,
+ ) -> torch.Tensor:
+     return torch.ops._qutlass_C.matmul_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
+
+
+ if hasattr(torch.ops._qutlass_C, "matmul_ada_mxf4_bf16_tn"):
+
+     @register_fake("_qutlass_C::matmul_ada_mxf4_bf16_tn")
+     def _fake_matmul_ada_mxf4_bf16_tn(
+         a: torch.Tensor,
+         b: torch.Tensor,
+         a_sf: torch.Tensor,
+         b_sf: torch.Tensor,
+         alpha: torch.Tensor,
+     ):
+         return a.new_empty(*a.shape[:-1], b.shape[0], dtype=torch.bfloat16)
+
+
+ def matmul_ada_mxf4_bf16_tn(
+     a: torch.Tensor,
+     b: torch.Tensor,
+     a_sf: torch.Tensor,
+     b_sf: torch.Tensor,
+     alpha: torch.Tensor,
+ ) -> torch.Tensor:
+     return torch.ops._qutlass_C.matmul_ada_mxf4_bf16_tn(a, b, a_sf, b_sf, alpha)
+
+
+ def ceil_div(a, b):
+     return (a + b - 1) // b
+
+
+ if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxQuest"):
+
+     @register_fake("_qutlass_C::fusedQuantizeMxQuest")
+     def _fake_fused_quantize_mx_quest(
+         a: torch.Tensor, b: torch.Tensor, xh_e2m1: torch.Tensor, xh_e8m0: torch.Tensor
+     ):
+         return xh_e2m1, xh_e8m0
+
+
+ if hasattr(torch.ops._qutlass_C, "fusedQuantizeMxAbsMax"):
+
+     @register_fake("_qutlass_C::fusedQuantizeMxAbsMax")
+     def _fake_fused_quantize_mx_absmax(
+         a: torch.Tensor, b: torch.Tensor, xh_e2m1: torch.Tensor, xh_e8m0: torch.Tensor
+     ):
+         return xh_e2m1, xh_e8m0
+
+
+ def fusedQuantizeMx(
+     a: torch.Tensor, b: torch.Tensor, *, method: Literal["quest", "abs_max"] = "quest"
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+     if a.dim() == 0:
+         raise ValueError("`a` must have at least 1 dimension.")
+     if a.size(-1) % 32 != 0:
+         raise ValueError(f"last dim of `a` must be divisible by 32, got {a.size(-1)}.")
+     if b.device != a.device:
+         raise ValueError("`a` and `b` must be on the same device.")
+
+     xh_e2m1 = torch.empty(
+         *a.shape[:-1], a.size(-1) // 2, dtype=torch.uint8, device=a.device
+     )
+
+     rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
+     n_row_blocks = ceil_div(rows, 128)
+     n_col_blocks = ceil_div(cols, 4)
+     padded_rows = n_row_blocks * 128
+     padded_cols = n_col_blocks * 4
+
+     xh_e8m0 = torch.empty(
+         padded_rows, padded_cols, dtype=torch.float8_e8m0fnu, device=a.device
+     )
+
+     if not hasattr(torch.ops, "_qutlass_C"):
+         raise RuntimeError(
+             "The `_qutlass_C` extension is not loaded. "
+             "Make sure your custom op library is imported before calling fusedQuantizeMx."
+         )
+
+     if method == "quest":
+         return torch.ops._qutlass_C.fusedQuantizeMxQuest(a, b, xh_e2m1, xh_e8m0)
+     elif method == "abs_max":
+         return torch.ops._qutlass_C.fusedQuantizeMxAbsMax(a, b, xh_e2m1, xh_e8m0)
+     else:
+         raise ValueError(f"invalid method {method!r}, must be 'quest' or 'abs_max'")
+
+
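The scale-buffer shape in fusedQuantizeMx follows directly from the block padding above: one e8m0 scale per 32-element group along the last dimension, with the scale matrix padded up to multiples of 128 rows and 4 columns. A worked example in plain Python (no extension required), using an illustrative [1000, 4096] activation:

    def ceil_div(a, b):
        return (a + b - 1) // b

    rows, hidden = 1000, 4096            # illustrative activation shape
    cols = hidden // 32                  # one e8m0 scale per 32-element group -> 128
    padded_rows = ceil_div(rows, 128) * 128
    padded_cols = ceil_div(cols, 4) * 4
    assert (padded_rows, padded_cols) == (1024, 128)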
+ if hasattr(torch.ops._qutlass_C, "fusedQuantizeNv"):
3155
+
3156
+ @register_fake("_qutlass_C::fusedQuantizeNv")
3157
+ def _fake_fused_quantize_nv(
3158
+ a: torch.Tensor,
3159
+ b: torch.Tensor,
3160
+ xh_e2m1: torch.Tensor,
3161
+ xh_e4m3: torch.Tensor,
3162
+ global_scale: torch.Tensor,
3163
+ ):
3164
+ return xh_e2m1, xh_e4m3
3165
+
3166
+
3167
+ def fusedQuantizeNv(
3168
+ a: torch.Tensor, b: torch.Tensor, global_scale: torch.Tensor
3169
+ ) -> tuple[torch.Tensor, torch.Tensor]:
3170
+ xh_e2m1 = torch.empty(
3171
+ *a.shape[:-1], a.size(-1) // 2, dtype=torch.uint8, device=a.device
3172
+ )
3173
+
3174
+ rows, cols = a.numel() // a.size(-1), a.size(-1) // 16
3175
+ n_row_blocks = ceil_div(rows, 128)
3176
+ n_col_blocks = ceil_div(cols, 4)
3177
+ padded_rows = n_row_blocks * 128
3178
+ padded_cols = n_col_blocks * 4
3179
+ xh_e4m3 = torch.empty(
3180
+ padded_rows, padded_cols, dtype=torch.float8_e4m3fn, device=a.device
3181
+ )
3182
+
3183
+ return torch.ops._qutlass_C.fusedQuantizeNv(a, b, xh_e2m1, xh_e4m3, global_scale)
3184
+
3185
+
3186
+ def hadacore_transform(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
+     """
+     Perform Hadamard transforms using [Hadacore](https://arxiv.org/abs/2412.08832)
+     kernels. Note that these kernels exploit the recursive properties of
+     Sylvester Hadamards, and therefore do not require transform weight data.
+
+     Note that Sylvester Hadamard transforms are also symmetric, which means that
+     this function also applies the inverse (equivalently, transpose) transform.
+
+     :param x: value to be transformed
+     :param inplace: whether to modify the value in place
+     :return: value after transformation
+     """
+     return torch.ops._C.hadacore_transform(x, inplace)
+
+
+ if hasattr(torch.ops._C, "hadacore_transform"):
+
+     @register_fake("_C::hadacore_transform")
+     def _hadacore_transform_fake(x: torch.Tensor, inplace: bool) -> torch.Tensor:
+         return torch.empty_like(x) if not inplace else x
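The symmetry claim in the hadacore_transform docstring can be checked with a dense Sylvester Hadamard matrix in plain PyTorch: with orthonormal scaling the matrix equals its own transpose and its own inverse, so applying the transform twice recovers the input. This reference construction is purely illustrative and independent of the Hadacore kernel itself.

    import torch


    def sylvester_hadamard(n: int) -> torch.Tensor:
        # Recursive Sylvester construction: H_{2k} = [[H_k, H_k], [H_k, -H_k]].
        h = torch.tensor([[1.0]])
        while h.shape[0] < n:
            h = torch.cat([torch.cat([h, h], dim=1), torch.cat([h, -h], dim=1)], dim=0)
        return h


    H = sylvester_hadamard(16) / 4.0                 # 1/sqrt(16) -> orthonormal rows
    assert torch.equal(H, H.T)                       # symmetric
    assert torch.allclose(H @ H, torch.eye(16))      # hence its own inverse
    x = torch.randn(16)
    assert torch.allclose(H @ (H @ x), x)            # applying it twice is the identity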