vllm-cpu 0.12.0__cp313-cp313-manylinux_2_17_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1600) hide show
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +107 -0
  3. vllm/_aiter_ops.py +1018 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +2925 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +434 -0
  16. vllm/attention/backends/registry.py +286 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +975 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +120 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/ops/__init__.py +0 -0
  24. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  25. vllm/attention/ops/common.py +469 -0
  26. vllm/attention/ops/flashmla.py +251 -0
  27. vllm/attention/ops/merge_attn_states.py +47 -0
  28. vllm/attention/ops/paged_attn.py +51 -0
  29. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  30. vllm/attention/ops/prefix_prefill.py +814 -0
  31. vllm/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  32. vllm/attention/ops/triton_decode_attention.py +712 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +116 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  35. vllm/attention/ops/triton_unified_attention.py +941 -0
  36. vllm/attention/ops/vit_attn_wrappers.py +136 -0
  37. vllm/attention/selector.py +268 -0
  38. vllm/attention/utils/__init__.py +0 -0
  39. vllm/attention/utils/fa_utils.py +117 -0
  40. vllm/attention/utils/kv_sharing_utils.py +33 -0
  41. vllm/attention/utils/kv_transfer_utils.py +60 -0
  42. vllm/beam_search.py +88 -0
  43. vllm/benchmarks/__init__.py +0 -0
  44. vllm/benchmarks/datasets.py +3222 -0
  45. vllm/benchmarks/latency.py +172 -0
  46. vllm/benchmarks/lib/__init__.py +3 -0
  47. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  48. vllm/benchmarks/lib/ready_checker.py +72 -0
  49. vllm/benchmarks/lib/utils.py +79 -0
  50. vllm/benchmarks/serve.py +1531 -0
  51. vllm/benchmarks/sweep/__init__.py +0 -0
  52. vllm/benchmarks/sweep/cli.py +41 -0
  53. vllm/benchmarks/sweep/param_sweep.py +91 -0
  54. vllm/benchmarks/sweep/plot.py +580 -0
  55. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  56. vllm/benchmarks/sweep/serve.py +448 -0
  57. vllm/benchmarks/sweep/serve_sla.py +492 -0
  58. vllm/benchmarks/sweep/server.py +114 -0
  59. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  60. vllm/benchmarks/sweep/utils.py +4 -0
  61. vllm/benchmarks/throughput.py +799 -0
  62. vllm/collect_env.py +857 -0
  63. vllm/compilation/__init__.py +0 -0
  64. vllm/compilation/activation_quant_fusion.py +209 -0
  65. vllm/compilation/backends.py +827 -0
  66. vllm/compilation/base_static_graph.py +57 -0
  67. vllm/compilation/caching.py +180 -0
  68. vllm/compilation/collective_fusion.py +1234 -0
  69. vllm/compilation/compiler_interface.py +639 -0
  70. vllm/compilation/counter.py +48 -0
  71. vllm/compilation/cuda_graph.py +208 -0
  72. vllm/compilation/decorators.py +614 -0
  73. vllm/compilation/fix_functionalization.py +253 -0
  74. vllm/compilation/fusion.py +374 -0
  75. vllm/compilation/fusion_attn.py +359 -0
  76. vllm/compilation/fx_utils.py +91 -0
  77. vllm/compilation/inductor_pass.py +133 -0
  78. vllm/compilation/matcher_utils.py +315 -0
  79. vllm/compilation/monitor.py +62 -0
  80. vllm/compilation/noop_elimination.py +134 -0
  81. vllm/compilation/partition_rules.py +72 -0
  82. vllm/compilation/pass_manager.py +136 -0
  83. vllm/compilation/piecewise_backend.py +121 -0
  84. vllm/compilation/post_cleanup.py +21 -0
  85. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  86. vllm/compilation/sequence_parallelism.py +363 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  88. vllm/compilation/vllm_inductor_pass.py +173 -0
  89. vllm/compilation/wrapper.py +260 -0
  90. vllm/config/__init__.py +102 -0
  91. vllm/config/cache.py +220 -0
  92. vllm/config/compilation.py +1154 -0
  93. vllm/config/device.py +75 -0
  94. vllm/config/ec_transfer.py +110 -0
  95. vllm/config/kv_events.py +56 -0
  96. vllm/config/kv_transfer.py +114 -0
  97. vllm/config/load.py +124 -0
  98. vllm/config/lora.py +96 -0
  99. vllm/config/model.py +2274 -0
  100. vllm/config/multimodal.py +247 -0
  101. vllm/config/observability.py +131 -0
  102. vllm/config/parallel.py +653 -0
  103. vllm/config/pooler.py +124 -0
  104. vllm/config/scheduler.py +297 -0
  105. vllm/config/speculative.py +643 -0
  106. vllm/config/speech_to_text.py +38 -0
  107. vllm/config/structured_outputs.py +94 -0
  108. vllm/config/utils.py +324 -0
  109. vllm/config/vllm.py +1353 -0
  110. vllm/connections.py +189 -0
  111. vllm/device_allocator/__init__.py +0 -0
  112. vllm/device_allocator/cumem.py +327 -0
  113. vllm/distributed/__init__.py +6 -0
  114. vllm/distributed/communication_op.py +43 -0
  115. vllm/distributed/device_communicators/__init__.py +0 -0
  116. vllm/distributed/device_communicators/all2all.py +490 -0
  117. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  118. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  119. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  120. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  121. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  122. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  123. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  124. vllm/distributed/device_communicators/pynccl.py +386 -0
  125. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  126. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  127. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  128. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  129. vllm/distributed/device_communicators/shm_broadcast.py +733 -0
  130. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  131. vllm/distributed/device_communicators/symm_mem.py +156 -0
  132. vllm/distributed/device_communicators/tpu_communicator.py +99 -0
  133. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  134. vllm/distributed/ec_transfer/__init__.py +14 -0
  135. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  136. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  137. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  138. vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +201 -0
  139. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  140. vllm/distributed/eplb/__init__.py +8 -0
  141. vllm/distributed/eplb/async_worker.py +115 -0
  142. vllm/distributed/eplb/eplb_state.py +1154 -0
  143. vllm/distributed/eplb/rebalance_algo.py +260 -0
  144. vllm/distributed/eplb/rebalance_execute.py +532 -0
  145. vllm/distributed/kv_events.py +371 -0
  146. vllm/distributed/kv_transfer/README.md +29 -0
  147. vllm/distributed/kv_transfer/__init__.py +20 -0
  148. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  150. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  151. vllm/distributed/kv_transfer/kv_connector/factory.py +192 -0
  152. vllm/distributed/kv_transfer/kv_connector/utils.py +268 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/base.py +575 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +216 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +378 -0
  159. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1411 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +895 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +189 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +454 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2480 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +538 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  169. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  170. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +450 -0
  171. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  172. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +179 -0
  173. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +164 -0
  174. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +242 -0
  175. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  176. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  177. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +295 -0
  178. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +285 -0
  179. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  180. vllm/distributed/parallel_state.py +1790 -0
  181. vllm/distributed/tpu_distributed_utils.py +188 -0
  182. vllm/distributed/utils.py +545 -0
  183. vllm/engine/__init__.py +0 -0
  184. vllm/engine/arg_utils.py +2106 -0
  185. vllm/engine/async_llm_engine.py +6 -0
  186. vllm/engine/llm_engine.py +6 -0
  187. vllm/engine/protocol.py +188 -0
  188. vllm/entrypoints/__init__.py +0 -0
  189. vllm/entrypoints/anthropic/__init__.py +0 -0
  190. vllm/entrypoints/anthropic/protocol.py +162 -0
  191. vllm/entrypoints/anthropic/serving_messages.py +460 -0
  192. vllm/entrypoints/api_server.py +184 -0
  193. vllm/entrypoints/chat_utils.py +1837 -0
  194. vllm/entrypoints/cli/__init__.py +13 -0
  195. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  196. vllm/entrypoints/cli/benchmark/base.py +25 -0
  197. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  198. vllm/entrypoints/cli/benchmark/main.py +56 -0
  199. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  200. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  201. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  202. vllm/entrypoints/cli/collect_env.py +38 -0
  203. vllm/entrypoints/cli/main.py +79 -0
  204. vllm/entrypoints/cli/openai.py +256 -0
  205. vllm/entrypoints/cli/run_batch.py +68 -0
  206. vllm/entrypoints/cli/serve.py +249 -0
  207. vllm/entrypoints/cli/types.py +29 -0
  208. vllm/entrypoints/constants.py +10 -0
  209. vllm/entrypoints/context.py +572 -0
  210. vllm/entrypoints/dynamic_lora.py +57 -0
  211. vllm/entrypoints/harmony_utils.py +535 -0
  212. vllm/entrypoints/launcher.py +175 -0
  213. vllm/entrypoints/llm.py +1762 -0
  214. vllm/entrypoints/logger.py +84 -0
  215. vllm/entrypoints/openai/__init__.py +0 -0
  216. vllm/entrypoints/openai/api_server.py +1891 -0
  217. vllm/entrypoints/openai/cli_args.py +302 -0
  218. vllm/entrypoints/openai/orca_metrics.py +120 -0
  219. vllm/entrypoints/openai/protocol.py +2465 -0
  220. vllm/entrypoints/openai/run_batch.py +631 -0
  221. vllm/entrypoints/openai/serving_chat.py +1782 -0
  222. vllm/entrypoints/openai/serving_completion.py +716 -0
  223. vllm/entrypoints/openai/serving_engine.py +1478 -0
  224. vllm/entrypoints/openai/serving_models.py +304 -0
  225. vllm/entrypoints/openai/serving_responses.py +2032 -0
  226. vllm/entrypoints/openai/serving_tokenization.py +203 -0
  227. vllm/entrypoints/openai/serving_tokens.py +281 -0
  228. vllm/entrypoints/openai/serving_transcription.py +168 -0
  229. vllm/entrypoints/openai/speech_to_text.py +559 -0
  230. vllm/entrypoints/openai/tool_parsers/__init__.py +142 -0
  231. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +273 -0
  232. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +390 -0
  233. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +390 -0
  234. vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +210 -0
  235. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +200 -0
  236. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  237. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +253 -0
  238. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +494 -0
  239. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  240. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +227 -0
  241. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +322 -0
  242. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +590 -0
  243. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  244. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +324 -0
  245. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +37 -0
  246. vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +643 -0
  247. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +849 -0
  248. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +390 -0
  249. vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py +366 -0
  250. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +97 -0
  251. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +120 -0
  252. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +332 -0
  253. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +781 -0
  254. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  255. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +744 -0
  256. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +303 -0
  257. vllm/entrypoints/openai/tool_parsers/utils.py +229 -0
  258. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +556 -0
  259. vllm/entrypoints/openai/utils.py +49 -0
  260. vllm/entrypoints/pooling/__init__.py +16 -0
  261. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  262. vllm/entrypoints/pooling/classify/api_router.py +50 -0
  263. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  264. vllm/entrypoints/pooling/classify/serving.py +237 -0
  265. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  266. vllm/entrypoints/pooling/embed/api_router.py +67 -0
  267. vllm/entrypoints/pooling/embed/protocol.py +208 -0
  268. vllm/entrypoints/pooling/embed/serving.py +697 -0
  269. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  270. vllm/entrypoints/pooling/pooling/api_router.py +63 -0
  271. vllm/entrypoints/pooling/pooling/protocol.py +148 -0
  272. vllm/entrypoints/pooling/pooling/serving.py +348 -0
  273. vllm/entrypoints/pooling/score/__init__.py +0 -0
  274. vllm/entrypoints/pooling/score/api_router.py +149 -0
  275. vllm/entrypoints/pooling/score/protocol.py +145 -0
  276. vllm/entrypoints/pooling/score/serving.py +505 -0
  277. vllm/entrypoints/renderer.py +409 -0
  278. vllm/entrypoints/responses_utils.py +148 -0
  279. vllm/entrypoints/sagemaker/__init__.py +4 -0
  280. vllm/entrypoints/sagemaker/routes.py +118 -0
  281. vllm/entrypoints/score_utils.py +240 -0
  282. vllm/entrypoints/ssl.py +78 -0
  283. vllm/entrypoints/tool.py +143 -0
  284. vllm/entrypoints/tool_server.py +234 -0
  285. vllm/entrypoints/utils.py +319 -0
  286. vllm/env_override.py +378 -0
  287. vllm/envs.py +1710 -0
  288. vllm/forward_context.py +358 -0
  289. vllm/inputs/__init__.py +44 -0
  290. vllm/inputs/data.py +359 -0
  291. vllm/inputs/parse.py +137 -0
  292. vllm/inputs/preprocess.py +716 -0
  293. vllm/logger.py +298 -0
  294. vllm/logging_utils/__init__.py +13 -0
  295. vllm/logging_utils/dump_input.py +83 -0
  296. vllm/logging_utils/formatter.py +127 -0
  297. vllm/logging_utils/lazy.py +20 -0
  298. vllm/logging_utils/log_time.py +34 -0
  299. vllm/logits_process.py +121 -0
  300. vllm/logprobs.py +206 -0
  301. vllm/lora/__init__.py +0 -0
  302. vllm/lora/layers/__init__.py +42 -0
  303. vllm/lora/layers/base.py +66 -0
  304. vllm/lora/layers/base_linear.py +165 -0
  305. vllm/lora/layers/column_parallel_linear.py +577 -0
  306. vllm/lora/layers/fused_moe.py +747 -0
  307. vllm/lora/layers/logits_processor.py +203 -0
  308. vllm/lora/layers/replicated_linear.py +70 -0
  309. vllm/lora/layers/row_parallel_linear.py +176 -0
  310. vllm/lora/layers/utils.py +74 -0
  311. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  312. vllm/lora/lora_weights.py +227 -0
  313. vllm/lora/models.py +903 -0
  314. vllm/lora/ops/__init__.py +0 -0
  315. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  316. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  317. vllm/lora/ops/torch_ops/__init__.py +20 -0
  318. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  319. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  320. vllm/lora/ops/triton_ops/__init__.py +21 -0
  321. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +661 -0
  322. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  323. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  324. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  325. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  326. vllm/lora/ops/triton_ops/utils.py +295 -0
  327. vllm/lora/ops/xla_ops/__init__.py +6 -0
  328. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  329. vllm/lora/peft_helper.py +128 -0
  330. vllm/lora/punica_wrapper/__init__.py +10 -0
  331. vllm/lora/punica_wrapper/punica_base.py +493 -0
  332. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  333. vllm/lora/punica_wrapper/punica_gpu.py +412 -0
  334. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  335. vllm/lora/punica_wrapper/punica_tpu.py +358 -0
  336. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  337. vllm/lora/punica_wrapper/utils.py +150 -0
  338. vllm/lora/request.py +100 -0
  339. vllm/lora/resolver.py +88 -0
  340. vllm/lora/utils.py +306 -0
  341. vllm/lora/worker_manager.py +268 -0
  342. vllm/model_executor/__init__.py +11 -0
  343. vllm/model_executor/custom_op.py +194 -0
  344. vllm/model_executor/layers/__init__.py +0 -0
  345. vllm/model_executor/layers/activation.py +595 -0
  346. vllm/model_executor/layers/attention_layer_base.py +32 -0
  347. vllm/model_executor/layers/batch_invariant.py +1058 -0
  348. vllm/model_executor/layers/conv.py +256 -0
  349. vllm/model_executor/layers/fla/__init__.py +8 -0
  350. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  351. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  352. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  353. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  354. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  355. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  356. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  357. vllm/model_executor/layers/fla/ops/index.py +41 -0
  358. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  359. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  360. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  361. vllm/model_executor/layers/fla/ops/op.py +60 -0
  362. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  363. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  364. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  365. vllm/model_executor/layers/fused_moe/__init__.py +110 -0
  366. vllm/model_executor/layers/fused_moe/all2all_utils.py +171 -0
  367. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +406 -0
  368. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +180 -0
  369. vllm/model_executor/layers/fused_moe/config.py +938 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  645. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +292 -0
  646. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1052 -0
  647. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +387 -0
  648. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +416 -0
  649. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  650. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +434 -0
  651. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +376 -0
  652. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  653. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  654. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  655. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  656. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +821 -0
  657. vllm/model_executor/layers/fused_moe/fused_moe.py +2172 -0
  658. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +121 -0
  659. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +136 -0
  660. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +524 -0
  661. vllm/model_executor/layers/fused_moe/layer.py +2152 -0
  662. vllm/model_executor/layers/fused_moe/modular_kernel.py +1332 -0
  663. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +174 -0
  664. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  665. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  666. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  667. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  668. vllm/model_executor/layers/fused_moe/prepare_finalize.py +78 -0
  669. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  670. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  671. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  672. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  673. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  674. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  675. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +559 -0
  676. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  677. vllm/model_executor/layers/kda.py +442 -0
  678. vllm/model_executor/layers/layernorm.py +442 -0
  679. vllm/model_executor/layers/lightning_attn.py +735 -0
  680. vllm/model_executor/layers/linear.py +1424 -0
  681. vllm/model_executor/layers/logits_processor.py +106 -0
  682. vllm/model_executor/layers/mamba/__init__.py +0 -0
  683. vllm/model_executor/layers/mamba/abstract.py +68 -0
  684. vllm/model_executor/layers/mamba/linear_attn.py +388 -0
  685. vllm/model_executor/layers/mamba/mamba_mixer.py +527 -0
  686. vllm/model_executor/layers/mamba/mamba_mixer2.py +930 -0
  687. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  688. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  689. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  690. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  691. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +478 -0
  692. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  693. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  694. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  695. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  696. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  697. vllm/model_executor/layers/mamba/short_conv.py +255 -0
  698. vllm/model_executor/layers/mla.py +176 -0
  699. vllm/model_executor/layers/pooler.py +817 -0
  700. vllm/model_executor/layers/quantization/__init__.py +179 -0
  701. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  702. vllm/model_executor/layers/quantization/awq.py +277 -0
  703. vllm/model_executor/layers/quantization/awq_marlin.py +718 -0
  704. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  705. vllm/model_executor/layers/quantization/base_config.py +170 -0
  706. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  707. vllm/model_executor/layers/quantization/bitsandbytes.py +644 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +963 -0
  710. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2387 -0
  711. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  712. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  713. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  714. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  715. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  716. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  717. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +183 -0
  718. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  719. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  720. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  721. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  722. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  723. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  724. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  725. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  726. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  727. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  728. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  729. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  730. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  731. vllm/model_executor/layers/quantization/cpu_wna16.py +625 -0
  732. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  733. vllm/model_executor/layers/quantization/experts_int8.py +225 -0
  734. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  735. vllm/model_executor/layers/quantization/fp8.py +1348 -0
  736. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  737. vllm/model_executor/layers/quantization/gguf.py +687 -0
  738. vllm/model_executor/layers/quantization/gptq.py +393 -0
  739. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  740. vllm/model_executor/layers/quantization/gptq_marlin.py +842 -0
  741. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  742. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  743. vllm/model_executor/layers/quantization/inc.py +65 -0
  744. vllm/model_executor/layers/quantization/input_quant_fp8.py +171 -0
  745. vllm/model_executor/layers/quantization/ipex_quant.py +470 -0
  746. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  747. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  748. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +105 -0
  749. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  750. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  751. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  752. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +119 -0
  753. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  754. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  755. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  756. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  757. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +73 -0
  758. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +97 -0
  759. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  760. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +219 -0
  761. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +140 -0
  762. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +42 -0
  763. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  764. vllm/model_executor/layers/quantization/kv_cache.py +146 -0
  765. vllm/model_executor/layers/quantization/modelopt.py +1637 -0
  766. vllm/model_executor/layers/quantization/moe_wna16.py +528 -0
  767. vllm/model_executor/layers/quantization/mxfp4.py +1175 -0
  768. vllm/model_executor/layers/quantization/petit.py +319 -0
  769. vllm/model_executor/layers/quantization/ptpc_fp8.py +136 -0
  770. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  771. vllm/model_executor/layers/quantization/quark/quark.py +527 -0
  772. vllm/model_executor/layers/quantization/quark/quark_moe.py +653 -0
  773. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  774. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  775. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  776. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  777. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  778. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  779. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  780. vllm/model_executor/layers/quantization/rtn.py +639 -0
  781. vllm/model_executor/layers/quantization/schema.py +90 -0
  782. vllm/model_executor/layers/quantization/torchao.py +380 -0
  783. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  784. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  785. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  786. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1002. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +333 -0
  1003. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +311 -0
  1004. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1203 -0
  1005. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1006. vllm/model_executor/layers/quantization/utils/int8_utils.py +489 -0
  1007. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1008. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1009. vllm/model_executor/layers/quantization/utils/marlin_utils.py +674 -0
  1010. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +452 -0
  1011. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1012. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1013. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1014. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +183 -0
  1015. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1016. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1017. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1018. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1019. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1020. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1021. vllm/model_executor/layers/quantization/utils/quant_utils.py +687 -0
  1022. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +516 -0
  1023. vllm/model_executor/layers/resampler.py +283 -0
  1024. vllm/model_executor/layers/rotary_embedding/__init__.py +292 -0
  1025. vllm/model_executor/layers/rotary_embedding/base.py +240 -0
  1026. vllm/model_executor/layers/rotary_embedding/common.py +188 -0
  1027. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1028. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1029. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1030. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1031. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +75 -0
  1032. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1033. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1034. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1035. vllm/model_executor/layers/rotary_embedding/mrope.py +397 -0
  1036. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1037. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1038. vllm/model_executor/layers/rotary_embedding/xdrope.py +102 -0
  1039. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1040. vllm/model_executor/layers/utils.py +251 -0
  1041. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1042. vllm/model_executor/model_loader/__init__.py +150 -0
  1043. vllm/model_executor/model_loader/base_loader.py +57 -0
  1044. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1045. vllm/model_executor/model_loader/default_loader.py +321 -0
  1046. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1047. vllm/model_executor/model_loader/gguf_loader.py +349 -0
  1048. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1049. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1050. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1051. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1052. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1053. vllm/model_executor/model_loader/tpu.py +118 -0
  1054. vllm/model_executor/model_loader/utils.py +296 -0
  1055. vllm/model_executor/model_loader/weight_utils.py +1147 -0
  1056. vllm/model_executor/models/__init__.py +44 -0
  1057. vllm/model_executor/models/adapters.py +543 -0
  1058. vllm/model_executor/models/afmoe.py +697 -0
  1059. vllm/model_executor/models/aimv2.py +248 -0
  1060. vllm/model_executor/models/apertus.py +569 -0
  1061. vllm/model_executor/models/arcee.py +428 -0
  1062. vllm/model_executor/models/arctic.py +634 -0
  1063. vllm/model_executor/models/aria.py +655 -0
  1064. vllm/model_executor/models/aya_vision.py +450 -0
  1065. vllm/model_executor/models/baichuan.py +494 -0
  1066. vllm/model_executor/models/bailing_moe.py +645 -0
  1067. vllm/model_executor/models/bamba.py +516 -0
  1068. vllm/model_executor/models/bee.py +157 -0
  1069. vllm/model_executor/models/bert.py +925 -0
  1070. vllm/model_executor/models/bert_with_rope.py +732 -0
  1071. vllm/model_executor/models/blip.py +350 -0
  1072. vllm/model_executor/models/blip2.py +695 -0
  1073. vllm/model_executor/models/bloom.py +390 -0
  1074. vllm/model_executor/models/chameleon.py +1098 -0
  1075. vllm/model_executor/models/chatglm.py +499 -0
  1076. vllm/model_executor/models/clip.py +1005 -0
  1077. vllm/model_executor/models/cohere2_vision.py +472 -0
  1078. vllm/model_executor/models/commandr.py +470 -0
  1079. vllm/model_executor/models/config.py +510 -0
  1080. vllm/model_executor/models/dbrx.py +485 -0
  1081. vllm/model_executor/models/deepencoder.py +676 -0
  1082. vllm/model_executor/models/deepseek_eagle.py +252 -0
  1083. vllm/model_executor/models/deepseek_mtp.py +446 -0
  1084. vllm/model_executor/models/deepseek_ocr.py +593 -0
  1085. vllm/model_executor/models/deepseek_v2.py +1715 -0
  1086. vllm/model_executor/models/deepseek_vl2.py +644 -0
  1087. vllm/model_executor/models/dots1.py +566 -0
  1088. vllm/model_executor/models/dots_ocr.py +874 -0
  1089. vllm/model_executor/models/ernie45.py +53 -0
  1090. vllm/model_executor/models/ernie45_moe.py +755 -0
  1091. vllm/model_executor/models/ernie45_vl.py +1710 -0
  1092. vllm/model_executor/models/ernie45_vl_moe.py +800 -0
  1093. vllm/model_executor/models/ernie_mtp.py +279 -0
  1094. vllm/model_executor/models/exaone.py +525 -0
  1095. vllm/model_executor/models/exaone4.py +517 -0
  1096. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1097. vllm/model_executor/models/falcon.py +544 -0
  1098. vllm/model_executor/models/falcon_h1.py +680 -0
  1099. vllm/model_executor/models/flex_olmo.py +155 -0
  1100. vllm/model_executor/models/fuyu.py +373 -0
  1101. vllm/model_executor/models/gemma.py +426 -0
  1102. vllm/model_executor/models/gemma2.py +436 -0
  1103. vllm/model_executor/models/gemma3.py +577 -0
  1104. vllm/model_executor/models/gemma3_mm.py +665 -0
  1105. vllm/model_executor/models/gemma3n.py +1167 -0
  1106. vllm/model_executor/models/gemma3n_mm.py +811 -0
  1107. vllm/model_executor/models/glm.py +23 -0
  1108. vllm/model_executor/models/glm4.py +298 -0
  1109. vllm/model_executor/models/glm4_1v.py +1854 -0
  1110. vllm/model_executor/models/glm4_moe.py +738 -0
  1111. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1112. vllm/model_executor/models/glm4v.py +785 -0
  1113. vllm/model_executor/models/gpt2.py +397 -0
  1114. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1115. vllm/model_executor/models/gpt_j.py +345 -0
  1116. vllm/model_executor/models/gpt_neox.py +343 -0
  1117. vllm/model_executor/models/gpt_oss.py +745 -0
  1118. vllm/model_executor/models/granite.py +476 -0
  1119. vllm/model_executor/models/granite_speech.py +913 -0
  1120. vllm/model_executor/models/granitemoe.py +561 -0
  1121. vllm/model_executor/models/granitemoehybrid.py +704 -0
  1122. vllm/model_executor/models/granitemoeshared.py +328 -0
  1123. vllm/model_executor/models/gritlm.py +245 -0
  1124. vllm/model_executor/models/grok1.py +555 -0
  1125. vllm/model_executor/models/h2ovl.py +554 -0
  1126. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1127. vllm/model_executor/models/hunyuan_vision.py +1028 -0
  1128. vllm/model_executor/models/hyperclovax_vision.py +1166 -0
  1129. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1130. vllm/model_executor/models/idefics3.py +718 -0
  1131. vllm/model_executor/models/interfaces.py +1148 -0
  1132. vllm/model_executor/models/interfaces_base.py +243 -0
  1133. vllm/model_executor/models/intern_vit.py +454 -0
  1134. vllm/model_executor/models/internlm2.py +454 -0
  1135. vllm/model_executor/models/internlm2_ve.py +139 -0
  1136. vllm/model_executor/models/interns1.py +830 -0
  1137. vllm/model_executor/models/interns1_vit.py +433 -0
  1138. vllm/model_executor/models/internvl.py +1452 -0
  1139. vllm/model_executor/models/jais.py +397 -0
  1140. vllm/model_executor/models/jamba.py +609 -0
  1141. vllm/model_executor/models/jina_vl.py +147 -0
  1142. vllm/model_executor/models/keye.py +1765 -0
  1143. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1144. vllm/model_executor/models/kimi_linear.py +658 -0
  1145. vllm/model_executor/models/kimi_vl.py +578 -0
  1146. vllm/model_executor/models/lfm2.py +516 -0
  1147. vllm/model_executor/models/lfm2_moe.py +746 -0
  1148. vllm/model_executor/models/lightonocr.py +195 -0
  1149. vllm/model_executor/models/llama.py +704 -0
  1150. vllm/model_executor/models/llama4.py +857 -0
  1151. vllm/model_executor/models/llama4_eagle.py +216 -0
  1152. vllm/model_executor/models/llama_eagle.py +213 -0
  1153. vllm/model_executor/models/llama_eagle3.py +375 -0
  1154. vllm/model_executor/models/llava.py +842 -0
  1155. vllm/model_executor/models/llava_next.py +583 -0
  1156. vllm/model_executor/models/llava_next_video.py +467 -0
  1157. vllm/model_executor/models/llava_onevision.py +923 -0
  1158. vllm/model_executor/models/longcat_flash.py +743 -0
  1159. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1160. vllm/model_executor/models/mamba.py +276 -0
  1161. vllm/model_executor/models/mamba2.py +288 -0
  1162. vllm/model_executor/models/medusa.py +179 -0
  1163. vllm/model_executor/models/midashenglm.py +828 -0
  1164. vllm/model_executor/models/mimo.py +188 -0
  1165. vllm/model_executor/models/mimo_mtp.py +294 -0
  1166. vllm/model_executor/models/minicpm.py +657 -0
  1167. vllm/model_executor/models/minicpm3.py +234 -0
  1168. vllm/model_executor/models/minicpm_eagle.py +385 -0
  1169. vllm/model_executor/models/minicpmo.py +768 -0
  1170. vllm/model_executor/models/minicpmv.py +1744 -0
  1171. vllm/model_executor/models/minimax_m2.py +546 -0
  1172. vllm/model_executor/models/minimax_text_01.py +1010 -0
  1173. vllm/model_executor/models/minimax_vl_01.py +396 -0
  1174. vllm/model_executor/models/mistral3.py +637 -0
  1175. vllm/model_executor/models/mistral_large_3.py +63 -0
  1176. vllm/model_executor/models/mistral_large_3_eagle.py +165 -0
  1177. vllm/model_executor/models/mixtral.py +599 -0
  1178. vllm/model_executor/models/mllama4.py +1151 -0
  1179. vllm/model_executor/models/mlp_speculator.py +235 -0
  1180. vllm/model_executor/models/modernbert.py +452 -0
  1181. vllm/model_executor/models/module_mapping.py +74 -0
  1182. vllm/model_executor/models/molmo.py +1553 -0
  1183. vllm/model_executor/models/moonvit.py +686 -0
  1184. vllm/model_executor/models/mpt.py +335 -0
  1185. vllm/model_executor/models/nano_nemotron_vl.py +1732 -0
  1186. vllm/model_executor/models/nemotron.py +502 -0
  1187. vllm/model_executor/models/nemotron_h.py +850 -0
  1188. vllm/model_executor/models/nemotron_nas.py +473 -0
  1189. vllm/model_executor/models/nemotron_vl.py +653 -0
  1190. vllm/model_executor/models/nvlm_d.py +216 -0
  1191. vllm/model_executor/models/olmo.py +413 -0
  1192. vllm/model_executor/models/olmo2.py +455 -0
  1193. vllm/model_executor/models/olmoe.py +494 -0
  1194. vllm/model_executor/models/opencua.py +271 -0
  1195. vllm/model_executor/models/openpangu.py +1051 -0
  1196. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1197. vllm/model_executor/models/opt.py +426 -0
  1198. vllm/model_executor/models/orion.py +366 -0
  1199. vllm/model_executor/models/ouro.py +508 -0
  1200. vllm/model_executor/models/ovis.py +559 -0
  1201. vllm/model_executor/models/ovis2_5.py +673 -0
  1202. vllm/model_executor/models/paddleocr_vl.py +1380 -0
  1203. vllm/model_executor/models/paligemma.py +412 -0
  1204. vllm/model_executor/models/persimmon.py +376 -0
  1205. vllm/model_executor/models/phi.py +370 -0
  1206. vllm/model_executor/models/phi3.py +18 -0
  1207. vllm/model_executor/models/phi3v.py +737 -0
  1208. vllm/model_executor/models/phi4_multimodal.py +1447 -0
  1209. vllm/model_executor/models/phi4mm.py +1253 -0
  1210. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1211. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1212. vllm/model_executor/models/phimoe.py +670 -0
  1213. vllm/model_executor/models/pixtral.py +1380 -0
  1214. vllm/model_executor/models/plamo2.py +966 -0
  1215. vllm/model_executor/models/plamo3.py +441 -0
  1216. vllm/model_executor/models/qwen.py +363 -0
  1217. vllm/model_executor/models/qwen2.py +569 -0
  1218. vllm/model_executor/models/qwen2_5_omni_thinker.py +1220 -0
  1219. vllm/model_executor/models/qwen2_5_vl.py +1594 -0
  1220. vllm/model_executor/models/qwen2_audio.py +473 -0
  1221. vllm/model_executor/models/qwen2_moe.py +590 -0
  1222. vllm/model_executor/models/qwen2_rm.py +123 -0
  1223. vllm/model_executor/models/qwen2_vl.py +1593 -0
  1224. vllm/model_executor/models/qwen3.py +332 -0
  1225. vllm/model_executor/models/qwen3_moe.py +738 -0
  1226. vllm/model_executor/models/qwen3_next.py +1390 -0
  1227. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1228. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1765 -0
  1229. vllm/model_executor/models/qwen3_vl.py +1686 -0
  1230. vllm/model_executor/models/qwen3_vl_moe.py +470 -0
  1231. vllm/model_executor/models/qwen_vl.py +803 -0
  1232. vllm/model_executor/models/radio.py +555 -0
  1233. vllm/model_executor/models/registry.py +1183 -0
  1234. vllm/model_executor/models/roberta.py +259 -0
  1235. vllm/model_executor/models/rvl.py +107 -0
  1236. vllm/model_executor/models/seed_oss.py +493 -0
  1237. vllm/model_executor/models/siglip.py +1245 -0
  1238. vllm/model_executor/models/siglip2navit.py +723 -0
  1239. vllm/model_executor/models/skyworkr1v.py +953 -0
  1240. vllm/model_executor/models/smolvlm.py +38 -0
  1241. vllm/model_executor/models/solar.py +485 -0
  1242. vllm/model_executor/models/stablelm.py +359 -0
  1243. vllm/model_executor/models/starcoder2.py +366 -0
  1244. vllm/model_executor/models/step3_text.py +555 -0
  1245. vllm/model_executor/models/step3_vl.py +1149 -0
  1246. vllm/model_executor/models/swin.py +514 -0
  1247. vllm/model_executor/models/tarsier.py +619 -0
  1248. vllm/model_executor/models/telechat2.py +153 -0
  1249. vllm/model_executor/models/teleflm.py +78 -0
  1250. vllm/model_executor/models/terratorch.py +319 -0
  1251. vllm/model_executor/models/transformers/__init__.py +127 -0
  1252. vllm/model_executor/models/transformers/base.py +464 -0
  1253. vllm/model_executor/models/transformers/causal.py +65 -0
  1254. vllm/model_executor/models/transformers/legacy.py +90 -0
  1255. vllm/model_executor/models/transformers/moe.py +325 -0
  1256. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1257. vllm/model_executor/models/transformers/pooling.py +119 -0
  1258. vllm/model_executor/models/transformers/utils.py +213 -0
  1259. vllm/model_executor/models/ultravox.py +686 -0
  1260. vllm/model_executor/models/utils.py +832 -0
  1261. vllm/model_executor/models/vision.py +552 -0
  1262. vllm/model_executor/models/voxtral.py +842 -0
  1263. vllm/model_executor/models/whisper.py +963 -0
  1264. vllm/model_executor/models/zamba2.py +980 -0
  1265. vllm/model_executor/parameter.py +642 -0
  1266. vllm/model_executor/utils.py +94 -0
  1267. vllm/model_executor/warmup/__init__.py +0 -0
  1268. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1269. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1270. vllm/multimodal/__init__.py +40 -0
  1271. vllm/multimodal/audio.py +142 -0
  1272. vllm/multimodal/base.py +26 -0
  1273. vllm/multimodal/cache.py +830 -0
  1274. vllm/multimodal/evs.py +294 -0
  1275. vllm/multimodal/hasher.py +106 -0
  1276. vllm/multimodal/image.py +130 -0
  1277. vllm/multimodal/inputs.py +1036 -0
  1278. vllm/multimodal/parse.py +544 -0
  1279. vllm/multimodal/processing.py +2240 -0
  1280. vllm/multimodal/profiling.py +369 -0
  1281. vllm/multimodal/registry.py +357 -0
  1282. vllm/multimodal/utils.py +523 -0
  1283. vllm/multimodal/video.py +333 -0
  1284. vllm/outputs.py +345 -0
  1285. vllm/platforms/__init__.py +277 -0
  1286. vllm/platforms/cpu.py +410 -0
  1287. vllm/platforms/cuda.py +642 -0
  1288. vllm/platforms/interface.py +656 -0
  1289. vllm/platforms/rocm.py +513 -0
  1290. vllm/platforms/tpu.py +275 -0
  1291. vllm/platforms/xpu.py +261 -0
  1292. vllm/plugins/__init__.py +81 -0
  1293. vllm/plugins/io_processors/__init__.py +68 -0
  1294. vllm/plugins/io_processors/interface.py +77 -0
  1295. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1296. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1297. vllm/pooling_params.py +230 -0
  1298. vllm/profiler/__init__.py +0 -0
  1299. vllm/profiler/gpu_profiler.py +216 -0
  1300. vllm/profiler/layerwise_profile.py +392 -0
  1301. vllm/profiler/utils.py +151 -0
  1302. vllm/py.typed +2 -0
  1303. vllm/ray/__init__.py +0 -0
  1304. vllm/ray/lazy_utils.py +30 -0
  1305. vllm/ray/ray_env.py +79 -0
  1306. vllm/reasoning/__init__.py +92 -0
  1307. vllm/reasoning/abs_reasoning_parsers.py +290 -0
  1308. vllm/reasoning/basic_parsers.py +162 -0
  1309. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1310. vllm/reasoning/deepseek_v3_reasoning_parser.py +62 -0
  1311. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1312. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1313. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1314. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1315. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1316. vllm/reasoning/identity_reasoning_parser.py +58 -0
  1317. vllm/reasoning/minimax_m2_reasoning_parser.py +67 -0
  1318. vllm/reasoning/mistral_reasoning_parser.py +55 -0
  1319. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1320. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1321. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1322. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1323. vllm/sampling_params.py +597 -0
  1324. vllm/scalar_type.py +355 -0
  1325. vllm/scripts.py +17 -0
  1326. vllm/sequence.py +98 -0
  1327. vllm/tasks.py +13 -0
  1328. vllm/third_party/__init__.py +0 -0
  1329. vllm/third_party/pynvml.py +6140 -0
  1330. vllm/tokenizers/__init__.py +24 -0
  1331. vllm/tokenizers/detokenizer_utils.py +198 -0
  1332. vllm/tokenizers/hf.py +124 -0
  1333. vllm/tokenizers/mistral.py +554 -0
  1334. vllm/tokenizers/protocol.py +111 -0
  1335. vllm/tokenizers/registry.py +233 -0
  1336. vllm/tracing.py +135 -0
  1337. vllm/transformers_utils/__init__.py +26 -0
  1338. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1339. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1340. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1341. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1342. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1343. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1344. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1345. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1346. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1347. vllm/transformers_utils/config.py +1081 -0
  1348. vllm/transformers_utils/config_parser_base.py +20 -0
  1349. vllm/transformers_utils/configs/__init__.py +84 -0
  1350. vllm/transformers_utils/configs/afmoe.py +87 -0
  1351. vllm/transformers_utils/configs/arctic.py +216 -0
  1352. vllm/transformers_utils/configs/chatglm.py +75 -0
  1353. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1354. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1355. vllm/transformers_utils/configs/eagle.py +90 -0
  1356. vllm/transformers_utils/configs/falcon.py +89 -0
  1357. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1358. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1359. vllm/transformers_utils/configs/jais.py +243 -0
  1360. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1361. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1362. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1363. vllm/transformers_utils/configs/medusa.py +65 -0
  1364. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1365. vllm/transformers_utils/configs/mistral.py +235 -0
  1366. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1367. vllm/transformers_utils/configs/moonvit.py +33 -0
  1368. vllm/transformers_utils/configs/nemotron.py +214 -0
  1369. vllm/transformers_utils/configs/nemotron_h.py +282 -0
  1370. vllm/transformers_utils/configs/olmo3.py +83 -0
  1371. vllm/transformers_utils/configs/ovis.py +182 -0
  1372. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1373. vllm/transformers_utils/configs/radio.py +89 -0
  1374. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1375. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1376. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1377. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1378. vllm/transformers_utils/configs/ultravox.py +118 -0
  1379. vllm/transformers_utils/dynamic_module.py +59 -0
  1380. vllm/transformers_utils/gguf_utils.py +209 -0
  1381. vllm/transformers_utils/processor.py +423 -0
  1382. vllm/transformers_utils/processors/__init__.py +23 -0
  1383. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1384. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1385. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1386. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1387. vllm/transformers_utils/processors/ovis.py +453 -0
  1388. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1389. vllm/transformers_utils/repo_utils.py +287 -0
  1390. vllm/transformers_utils/runai_utils.py +104 -0
  1391. vllm/transformers_utils/s3_utils.py +95 -0
  1392. vllm/transformers_utils/tokenizer.py +127 -0
  1393. vllm/transformers_utils/tokenizer_base.py +33 -0
  1394. vllm/transformers_utils/utils.py +184 -0
  1395. vllm/triton_utils/__init__.py +20 -0
  1396. vllm/triton_utils/importing.py +103 -0
  1397. vllm/usage/__init__.py +0 -0
  1398. vllm/usage/usage_lib.py +294 -0
  1399. vllm/utils/__init__.py +66 -0
  1400. vllm/utils/argparse_utils.py +504 -0
  1401. vllm/utils/async_utils.py +310 -0
  1402. vllm/utils/cache.py +214 -0
  1403. vllm/utils/collection_utils.py +112 -0
  1404. vllm/utils/counter.py +45 -0
  1405. vllm/utils/deep_gemm.py +399 -0
  1406. vllm/utils/flashinfer.py +532 -0
  1407. vllm/utils/func_utils.py +236 -0
  1408. vllm/utils/gc_utils.py +151 -0
  1409. vllm/utils/hashing.py +81 -0
  1410. vllm/utils/import_utils.py +449 -0
  1411. vllm/utils/jsontree.py +158 -0
  1412. vllm/utils/math_utils.py +32 -0
  1413. vllm/utils/mem_constants.py +13 -0
  1414. vllm/utils/mem_utils.py +232 -0
  1415. vllm/utils/nccl.py +64 -0
  1416. vllm/utils/network_utils.py +331 -0
  1417. vllm/utils/platform_utils.py +59 -0
  1418. vllm/utils/profiling.py +56 -0
  1419. vllm/utils/registry.py +51 -0
  1420. vllm/utils/serial_utils.py +169 -0
  1421. vllm/utils/system_utils.py +265 -0
  1422. vllm/utils/tensor_schema.py +255 -0
  1423. vllm/utils/torch_utils.py +647 -0
  1424. vllm/v1/__init__.py +0 -0
  1425. vllm/v1/attention/__init__.py +0 -0
  1426. vllm/v1/attention/backends/__init__.py +0 -0
  1427. vllm/v1/attention/backends/cpu_attn.py +497 -0
  1428. vllm/v1/attention/backends/flash_attn.py +1050 -0
  1429. vllm/v1/attention/backends/flashinfer.py +1572 -0
  1430. vllm/v1/attention/backends/flex_attention.py +945 -0
  1431. vllm/v1/attention/backends/gdn_attn.py +387 -0
  1432. vllm/v1/attention/backends/linear_attn.py +77 -0
  1433. vllm/v1/attention/backends/mamba1_attn.py +165 -0
  1434. vllm/v1/attention/backends/mamba2_attn.py +354 -0
  1435. vllm/v1/attention/backends/mamba_attn.py +117 -0
  1436. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1437. vllm/v1/attention/backends/mla/aiter_triton_mla.py +74 -0
  1438. vllm/v1/attention/backends/mla/common.py +2069 -0
  1439. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1440. vllm/v1/attention/backends/mla/flashattn_mla.py +340 -0
  1441. vllm/v1/attention/backends/mla/flashinfer_mla.py +174 -0
  1442. vllm/v1/attention/backends/mla/flashmla.py +317 -0
  1443. vllm/v1/attention/backends/mla/flashmla_sparse.py +551 -0
  1444. vllm/v1/attention/backends/mla/indexer.py +369 -0
  1445. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +275 -0
  1446. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +325 -0
  1447. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1448. vllm/v1/attention/backends/pallas.py +436 -0
  1449. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1450. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1451. vllm/v1/attention/backends/rocm_attn.py +359 -0
  1452. vllm/v1/attention/backends/short_conv_attn.py +105 -0
  1453. vllm/v1/attention/backends/tree_attn.py +428 -0
  1454. vllm/v1/attention/backends/triton_attn.py +377 -0
  1455. vllm/v1/attention/backends/utils.py +1149 -0
  1456. vllm/v1/core/__init__.py +0 -0
  1457. vllm/v1/core/block_pool.py +466 -0
  1458. vllm/v1/core/encoder_cache_manager.py +343 -0
  1459. vllm/v1/core/kv_cache_coordinator.py +570 -0
  1460. vllm/v1/core/kv_cache_manager.py +408 -0
  1461. vllm/v1/core/kv_cache_metrics.py +96 -0
  1462. vllm/v1/core/kv_cache_utils.py +1471 -0
  1463. vllm/v1/core/sched/__init__.py +0 -0
  1464. vllm/v1/core/sched/async_scheduler.py +68 -0
  1465. vllm/v1/core/sched/interface.py +187 -0
  1466. vllm/v1/core/sched/output.py +230 -0
  1467. vllm/v1/core/sched/request_queue.py +217 -0
  1468. vllm/v1/core/sched/scheduler.py +1726 -0
  1469. vllm/v1/core/sched/utils.py +72 -0
  1470. vllm/v1/core/single_type_kv_cache_manager.py +801 -0
  1471. vllm/v1/cudagraph_dispatcher.py +183 -0
  1472. vllm/v1/engine/__init__.py +214 -0
  1473. vllm/v1/engine/async_llm.py +874 -0
  1474. vllm/v1/engine/coordinator.py +377 -0
  1475. vllm/v1/engine/core.py +1421 -0
  1476. vllm/v1/engine/core_client.py +1406 -0
  1477. vllm/v1/engine/detokenizer.py +351 -0
  1478. vllm/v1/engine/exceptions.py +18 -0
  1479. vllm/v1/engine/input_processor.py +636 -0
  1480. vllm/v1/engine/llm_engine.py +416 -0
  1481. vllm/v1/engine/logprobs.py +189 -0
  1482. vllm/v1/engine/output_processor.py +658 -0
  1483. vllm/v1/engine/parallel_sampling.py +145 -0
  1484. vllm/v1/engine/processor.py +20 -0
  1485. vllm/v1/engine/utils.py +1068 -0
  1486. vllm/v1/executor/__init__.py +6 -0
  1487. vllm/v1/executor/abstract.py +352 -0
  1488. vllm/v1/executor/multiproc_executor.py +888 -0
  1489. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1490. vllm/v1/executor/ray_executor.py +626 -0
  1491. vllm/v1/executor/ray_utils.py +465 -0
  1492. vllm/v1/executor/uniproc_executor.py +183 -0
  1493. vllm/v1/kv_cache_interface.py +404 -0
  1494. vllm/v1/kv_offload/__init__.py +0 -0
  1495. vllm/v1/kv_offload/abstract.py +161 -0
  1496. vllm/v1/kv_offload/arc_manager.py +237 -0
  1497. vllm/v1/kv_offload/backend.py +97 -0
  1498. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1499. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1500. vllm/v1/kv_offload/cpu.py +86 -0
  1501. vllm/v1/kv_offload/factory.py +56 -0
  1502. vllm/v1/kv_offload/lru_manager.py +139 -0
  1503. vllm/v1/kv_offload/mediums.py +39 -0
  1504. vllm/v1/kv_offload/spec.py +66 -0
  1505. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1506. vllm/v1/kv_offload/worker/cpu_gpu.py +191 -0
  1507. vllm/v1/kv_offload/worker/worker.py +144 -0
  1508. vllm/v1/metrics/__init__.py +0 -0
  1509. vllm/v1/metrics/loggers.py +1268 -0
  1510. vllm/v1/metrics/prometheus.py +82 -0
  1511. vllm/v1/metrics/ray_wrappers.py +194 -0
  1512. vllm/v1/metrics/reader.py +257 -0
  1513. vllm/v1/metrics/stats.py +431 -0
  1514. vllm/v1/outputs.py +237 -0
  1515. vllm/v1/pool/__init__.py +0 -0
  1516. vllm/v1/pool/metadata.py +82 -0
  1517. vllm/v1/request.py +280 -0
  1518. vllm/v1/sample/__init__.py +0 -0
  1519. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1520. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1521. vllm/v1/sample/logits_processor/interface.py +106 -0
  1522. vllm/v1/sample/logits_processor/state.py +165 -0
  1523. vllm/v1/sample/metadata.py +44 -0
  1524. vllm/v1/sample/ops/__init__.py +0 -0
  1525. vllm/v1/sample/ops/bad_words.py +52 -0
  1526. vllm/v1/sample/ops/logprobs.py +25 -0
  1527. vllm/v1/sample/ops/penalties.py +57 -0
  1528. vllm/v1/sample/ops/topk_topp_sampler.py +384 -0
  1529. vllm/v1/sample/rejection_sampler.py +805 -0
  1530. vllm/v1/sample/sampler.py +319 -0
  1531. vllm/v1/sample/tpu/__init__.py +0 -0
  1532. vllm/v1/sample/tpu/metadata.py +120 -0
  1533. vllm/v1/sample/tpu/sampler.py +215 -0
  1534. vllm/v1/serial_utils.py +532 -0
  1535. vllm/v1/spec_decode/__init__.py +0 -0
  1536. vllm/v1/spec_decode/eagle.py +1325 -0
  1537. vllm/v1/spec_decode/medusa.py +73 -0
  1538. vllm/v1/spec_decode/metadata.py +66 -0
  1539. vllm/v1/spec_decode/metrics.py +225 -0
  1540. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1541. vllm/v1/spec_decode/suffix_decoding.py +101 -0
  1542. vllm/v1/spec_decode/utils.py +121 -0
  1543. vllm/v1/structured_output/__init__.py +338 -0
  1544. vllm/v1/structured_output/backend_guidance.py +265 -0
  1545. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1546. vllm/v1/structured_output/backend_outlines.py +324 -0
  1547. vllm/v1/structured_output/backend_types.py +136 -0
  1548. vllm/v1/structured_output/backend_xgrammar.py +362 -0
  1549. vllm/v1/structured_output/request.py +94 -0
  1550. vllm/v1/structured_output/utils.py +469 -0
  1551. vllm/v1/utils.py +414 -0
  1552. vllm/v1/worker/__init__.py +0 -0
  1553. vllm/v1/worker/block_table.py +343 -0
  1554. vllm/v1/worker/cpu_model_runner.py +122 -0
  1555. vllm/v1/worker/cpu_worker.py +210 -0
  1556. vllm/v1/worker/dp_utils.py +250 -0
  1557. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1558. vllm/v1/worker/gpu/README.md +4 -0
  1559. vllm/v1/worker/gpu/__init__.py +0 -0
  1560. vllm/v1/worker/gpu/async_utils.py +97 -0
  1561. vllm/v1/worker/gpu/attn_utils.py +189 -0
  1562. vllm/v1/worker/gpu/block_table.py +314 -0
  1563. vllm/v1/worker/gpu/cudagraph_utils.py +259 -0
  1564. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1565. vllm/v1/worker/gpu/input_batch.py +430 -0
  1566. vllm/v1/worker/gpu/model_runner.py +1007 -0
  1567. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1568. vllm/v1/worker/gpu/sample/gumbel.py +101 -0
  1569. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1570. vllm/v1/worker/gpu/sample/metadata.py +179 -0
  1571. vllm/v1/worker/gpu/sample/penalties.py +154 -0
  1572. vllm/v1/worker/gpu/sample/sampler.py +75 -0
  1573. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1574. vllm/v1/worker/gpu/spec_decode/eagle.py +565 -0
  1575. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1576. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +83 -0
  1577. vllm/v1/worker/gpu/states.py +309 -0
  1578. vllm/v1/worker/gpu/structured_outputs.py +76 -0
  1579. vllm/v1/worker/gpu_input_batch.py +971 -0
  1580. vllm/v1/worker/gpu_model_runner.py +5360 -0
  1581. vllm/v1/worker/gpu_ubatch_wrapper.py +472 -0
  1582. vllm/v1/worker/gpu_worker.py +922 -0
  1583. vllm/v1/worker/kv_connector_model_runner_mixin.py +309 -0
  1584. vllm/v1/worker/lora_model_runner_mixin.py +212 -0
  1585. vllm/v1/worker/tpu_input_batch.py +583 -0
  1586. vllm/v1/worker/tpu_model_runner.py +2196 -0
  1587. vllm/v1/worker/tpu_worker.py +351 -0
  1588. vllm/v1/worker/ubatch_utils.py +73 -0
  1589. vllm/v1/worker/ubatching.py +231 -0
  1590. vllm/v1/worker/utils.py +365 -0
  1591. vllm/v1/worker/worker_base.py +377 -0
  1592. vllm/v1/worker/xpu_model_runner.py +48 -0
  1593. vllm/v1/worker/xpu_worker.py +198 -0
  1594. vllm/version.py +39 -0
  1595. vllm/vllm_flash_attn/.gitkeep +0 -0
  1596. vllm_cpu-0.12.0.dist-info/METADATA +300 -0
  1597. vllm_cpu-0.12.0.dist-info/RECORD +1600 -0
  1598. vllm_cpu-0.12.0.dist-info/WHEEL +5 -0
  1599. vllm_cpu-0.12.0.dist-info/entry_points.txt +5 -0
  1600. vllm_cpu-0.12.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1471 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ """KV-Cache Utilities."""
4
+
5
+ import copy
6
+ import os
7
+ from collections import defaultdict
8
+ from collections.abc import Callable, Iterable, Iterator, Sequence
9
+ from dataclasses import dataclass, replace
10
+ from typing import Any, NewType, TypeAlias, overload
11
+
12
+ from vllm import envs
13
+ from vllm.config import VllmConfig
14
+ from vllm.logger import init_logger
15
+ from vllm.utils.hashing import sha256_cbor
16
+ from vllm.utils.math_utils import cdiv
17
+ from vllm.utils.mem_constants import GiB_bytes
18
+ from vllm.v1.kv_cache_interface import (
19
+ ChunkedLocalAttentionSpec,
20
+ FullAttentionSpec,
21
+ KVCacheConfig,
22
+ KVCacheGroupSpec,
23
+ KVCacheSpec,
24
+ KVCacheTensor,
25
+ SlidingWindowSpec,
26
+ UniformTypeKVCacheSpecs,
27
+ )
28
+ from vllm.v1.request import Request
29
+ from vllm.v1.utils import tensor_data
30
+
31
+ # BlockHash represents the hash of a single KV-cache block used for
32
+ # prefix caching. Treating it as a distinct type from `bytes` helps
33
+ # catch accidental misuse when passing around raw byte strings.
34
+ BlockHash = NewType("BlockHash", bytes)
35
+
36
+ # `BlockHashWithGroupId` combines a `BlockHash` with its KV cache group ID.
37
+ # It is represented as raw bytes for compactness and efficiency. The helper
38
+ # functions below pack/unpack the `BlockHash` and group id into/from the key.
39
+ BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
40
+
41
+ # ExternalBlockHash is used for reproducible prefix-cache block hashing.
42
+ # It's a union of `bytes` and `int` to keep backward compatibility
43
+ # after we default block hashing to use sha256 bytes.
44
+ ExternalBlockHash: TypeAlias = bytes | int
45
+
46
+
47
def make_block_hash_with_group_id(
    block_hash: BlockHash, group_id: int
) -> BlockHashWithGroupId:
    """Build the combined cache key for a block hash and a KV cache group id.

    The key is the raw block-hash bytes followed by the group id serialized
    as a 4-byte unsigned big-endian integer. Keeping the key as flat bytes
    avoids allocating a tuple per key, while both parts can still be
    recovered by splitting off the 4-byte suffix.
    """
    group_suffix = group_id.to_bytes(4, "big", signed=False)
    return BlockHashWithGroupId(block_hash + group_suffix)
57
+
58
+
59
def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
    """Return the block-hash part of a packed key (all but its last 4 bytes)."""
    hash_part = key[:-4]
    return BlockHash(hash_part)
62
+
63
+
64
def get_group_id(key: BlockHashWithGroupId) -> int:
    """Return the group id decoded from the last 4 bytes of a packed key."""
    group_suffix = key[-4:]
    return int.from_bytes(group_suffix, "big", signed=False)
67
+
68
+
69
def maybe_convert_block_hash(hash_bytes: BlockHash) -> ExternalBlockHash:
    """Convert a block hash to its external representation.

    When `envs.VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES` is enabled, the hash bytes
    are read as a big-endian integer and truncated to its low 64 bits.
    Otherwise the bytes are returned unchanged.
    """
    if envs.VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES:
        low_64_bits_mask = (1 << 64) - 1
        return int.from_bytes(hash_bytes, byteorder="big") & low_64_bits_mask
    return hash_bytes
73
+
74
+
75
+ logger = init_logger(__name__)
76
+
77
+ # The hash seed for the first block of any prefix block sequence.
78
+ #
79
+ # The seed is derived from the PYTHONHASHSEED environment variable when it is
80
+ # set, so that processes can share the same seed; otherwise a random value is
81
+ # used to avoid hash collisions. This mirrors Python's hash() function, which
82
+ # also uses a random seed if PYTHONHASHSEED is not set.
83
+ #
84
+ # The function `init_none_hash` initializes this variable globally.
85
+ NONE_HASH: BlockHash
86
+
87
+
88
def init_none_hash(hash_fn: Callable[[Any], bytes]):
    """Initialize the module-level `NONE_HASH` seed.

    If the PYTHONHASHSEED environment variable is set, the seed is
    `hash_fn(PYTHONHASHSEED)`; otherwise it is 32 random bytes from
    `os.urandom`. A warning is logged when the variable is unset and
    `hash_fn` is `sha256_cbor`.

    Args:
        hash_fn: The hash function applied to the PYTHONHASHSEED string.
    """
    global NONE_HASH

    hash_seed = os.getenv("PYTHONHASHSEED")
    if hash_seed is None and hash_fn is sha256_cbor:
        # Adjacent literals are concatenated verbatim, so each fragment must
        # end with a space to keep the sentences separated.
        logger.warning(
            "PYTHONHASHSEED is not set. This will lead to non-reproducible "
            "block-hashes when using sha256_cbor as the hash function. "
            "Consider setting PYTHONHASHSEED to a fixed value for "
            "reproducibility."
        )

    if hash_seed is None:
        NONE_HASH = BlockHash(os.urandom(32))
    else:
        NONE_HASH = BlockHash(hash_fn(hash_seed))
104
+
105
+
106
@dataclass
class KVCacheBlock:
    """Metadata record for a single KV-cache block."""

    # Identifier of the block, ranging from 0 to num_gpu_blocks - 1.
    block_id: int
    # Number of references currently held on this block.
    ref_cnt: int = 0
    # Packed key (block hash + group id). Only set once the block is full
    # and cached; None otherwise.
    _block_hash: BlockHashWithGroupId | None = None

    # Links of the doubly linked free list. Only FreeKVCacheBlockQueue
    # should manipulate these two attributes.
    prev_free_block: "KVCacheBlock | None" = None
    next_free_block: "KVCacheBlock | None" = None

    # True for a null block, which should never be cached.
    is_null: bool = False

    @property
    def block_hash(self) -> BlockHashWithGroupId | None:
        """The packed hash key of the block, or None if it has none."""
        return self._block_hash

    @block_hash.setter
    def block_hash(self, block_hash: BlockHashWithGroupId):
        # A hash may only be assigned to a block that does not have one yet.
        assert self._block_hash is None, (
            "The block already has a hash. This should not happen."
        )
        self._block_hash = block_hash

    def reset_hash(self):
        """Clear the block hash; used when the block is evicted."""
        self._block_hash = None

    def __repr__(self) -> str:
        # Print neighbour ids rather than the neighbour objects themselves,
        # otherwise __repr__ would recurse along the linked list.
        prev_blk = self.prev_free_block
        next_blk = self.next_free_block
        prev_block_id = prev_blk.block_id if prev_blk else None
        next_block_id = next_blk.block_id if next_blk else None
        fields = (
            f"block_id={self.block_id}, "
            f"ref_cnt={self.ref_cnt}, "
            f"_block_hash={self._block_hash!r}, "
            f"prev_free_block={prev_block_id}, "
            f"next_free_block={next_block_id}"
        )
        return f"KVCacheBlock({fields})"
153
+
154
+
155
class FreeKVCacheBlockQueue:
    """Doubly linked queue of free KVCacheBlock objects.

    A builtin deque cannot remove an element from the middle in O(1) time,
    which is why this class exists. To stay close to the speed of the
    natively implemented deque, no extra Python objects are allocated while
    manipulating the list: the queue is threaded through the
    prev_free_block / next_free_block attributes of the blocks themselves.

    Initially the queue is ordered by block ID. Blocks that are allocated and
    later freed are appended back in eviction order:
    1. The least recently used block is at the front (LRU).
    2. Among blocks with the same last access time (allocated by the same
       sequence), the one with more hash tokens (the tail of a block chain)
       is at the front.
    This order is maintained by the caller, which reverses the block order
    when freeing the blocks of a request; it is not enforced by this class.

    Args:
        blocks: A list of KVCacheBlock objects.
    """

    def __init__(self, blocks: list[KVCacheBlock]) -> None:
        self.num_free_blocks = len(blocks)

        # Link every pair of neighbouring blocks in both directions.
        for left, right in zip(blocks, blocks[1:]):
            left.next_free_block = right
            right.prev_free_block = left

        # Sentinel head and tail nodes reduce branching: they are never
        # popped, so every real block in the queue always has both a
        # previous and a next block.
        head = KVCacheBlock(block_id=-1)
        tail = KVCacheBlock(block_id=-1)
        self.fake_free_list_head = head
        self.fake_free_list_tail = tail

        if self.num_free_blocks > 0:
            # Attach the sentinels around the first and last real blocks.
            first, last = blocks[0], blocks[-1]
            head.next_free_block = first
            first.prev_free_block = head
            tail.prev_free_block = last
            last.next_free_block = tail
        else:
            # Empty queue: the sentinels point at each other.
            head.next_free_block = tail
            tail.prev_free_block = head

    def popleft(self) -> KVCacheBlock:
        """Pop the first free block and reduce num_free_blocks by 1.

        Returns:
            The first free block.

        Raises:
            ValueError: If the queue holds no free blocks.
            RuntimeError: If the first block has no next_free_block link.
        """
        head = self.fake_free_list_head
        first_block = head.next_free_block
        if first_block is self.fake_free_list_tail or first_block is None:
            assert self.num_free_blocks == 0, (
                f"num_free_blocks ({self.num_free_blocks}) is out of sync "
                "with the free list."
            )
            raise ValueError("No free blocks available")

        successor = first_block.next_free_block
        if successor is None:
            # A block that really is in the free list always has a next
            # block, so this points at a bug in the caller's logic.
            raise RuntimeError(
                "Invalid block found in popleft() "
                "which doesn't have a valid next_free_block"
            )

        # Bypass the popped block: the head now points at the second block
        # (or at the fake tail).
        head.next_free_block = successor
        successor.prev_free_block = head

        # Fully detach the popped block from the list.
        first_block.prev_free_block = first_block.next_free_block = None

        self.num_free_blocks -= 1
        return first_block

    def popleft_n(self, n: int) -> list[KVCacheBlock]:
        """Pop the first n free blocks and reduce num_free_blocks by n.

        Args:
            n: The number of blocks to pop.

        Returns:
            A list of n free blocks.
        """
        if n == 0:
            return []
        assert self.num_free_blocks >= n
        self.num_free_blocks -= n

        node = self.fake_free_list_head.next_free_block
        popped = []
        for _ in range(n):
            assert node is not None
            popped.append(node)
            detached, node = node, node.next_free_block
            # Clear both links of every popped block.
            detached.prev_free_block = None
            detached.next_free_block = None

        if node is not None:
            # Re-attach the head to whatever now comes first.
            self.fake_free_list_head.next_free_block = node
            node.prev_free_block = self.fake_free_list_head
        return popped

    def remove(self, block: KVCacheBlock) -> None:
        """Remove a block in the free list and reduce num_free_blocks by 1.

        Args:
            block: The block to remove.

        Raises:
            RuntimeError: If the block is missing either of its links.
        """
        before, after = block.prev_free_block, block.next_free_block
        if before is None or after is None:
            # A block that really is in the free list always has both links,
            # so this points at a bug in the caller's logic.
            raise RuntimeError(f"remove() called on an invalid block: {block}")

        # Make the two neighbours point at each other.
        before.next_free_block = after
        after.prev_free_block = before

        # Fully detach the removed block from the list.
        block.prev_free_block = block.next_free_block = None
        self.num_free_blocks -= 1

    def append(self, block: KVCacheBlock) -> None:
        """Put a block back into the free list and increase
        num_free_blocks by 1.

        Args:
            block: The block to append.
        """
        tail = self.fake_free_list_tail
        last_block = tail.prev_free_block
        if last_block is None:
            raise RuntimeError(
                "prev_free_block of fake_free_list_tail should always exist"
            )

        # Insert the block between the current last block and the fake tail.
        last_block.next_free_block = block
        block.prev_free_block = last_block
        block.next_free_block = tail
        tail.prev_free_block = block

        self.num_free_blocks += 1

    def append_n(self, blocks: list[KVCacheBlock]) -> None:
        """Put a list of blocks back into the free list

        Args:
            blocks: The blocks to append.
        """
        if not blocks:
            return

        tail = self.fake_free_list_tail
        cursor = tail.prev_free_block
        assert cursor is not None, (
            "prev_free_block of fake_free_list_tail should always exist"
        )
        # Chain each new block after the previous one.
        for block in blocks:
            block.prev_free_block = cursor
            cursor.next_free_block = block
            cursor = block

        # Close the list: the last appended block precedes the fake tail.
        cursor.next_free_block = tail
        tail.prev_free_block = cursor

        self.num_free_blocks += len(blocks)

    def get_all_free_blocks(self) -> list[KVCacheBlock]:
        """Get all free blocks in the free list. Mainly used for testing.

        Returns:
            A list of free blocks.
        """
        node = self.fake_free_list_head.next_free_block
        if node is None:
            raise RuntimeError(
                "next_free_block of fake_free_list_head should always exist"
            )
        free_blocks = []
        # Only the fake tail has no next_free_block, so the walk stops there
        # without including it.
        while node.next_free_block is not None:
            free_blocks.append(node)
            node = node.next_free_block
        return free_blocks
364
+
365
+
366
def need_extra_keys(request: Request) -> bool:
    """Tell whether block hashes of this request must include extra keys.

    Args:
        request (Request): The request.

    Returns:
        bool: True if the request has multi-modal features, a LoRA request,
        or a cache salt; False otherwise.
    """
    # Multi-modal requests contribute the MM hash.
    if request.mm_features:
        return True
    # LoRA requests contribute the LoRA name.
    if request.lora_request is not None:
        return True
    # Requests with a cache salt contribute the salt.
    return request.cache_salt is not None
384
+
385
+
386
def _gen_mm_extra_hash_keys(
    request: Request, start_token_idx: int, end_token_idx: int, start_mm_idx: int
) -> tuple[list[Any], int]:
    """Collect the multi-modal extra keys for one block's hash.

    For every multi-modal input that the block is considered to contain, the
    input's `identifier` is added to the extra keys.

    Args:
        request: The request object.
        start_token_idx: The start token index of the block.
        end_token_idx: The end token index of the block.
        start_mm_idx: The multi-modal index to start scanning from. A
            negative value counts from the end (-1 is the last mm input).

    Returns:
        A tuple of extra keys and the next multi-modal index.
    """
    extra_keys: list[Any] = []

    mm_features = request.mm_features
    if not mm_features:
        return extra_keys, start_mm_idx

    num_mm_features = len(mm_features)

    # mm_features are assumed to be sorted by mm_position.offset, so when the
    # block starts beyond the last mm input nothing needs to be scanned.
    # This usually happens in the late prefill phase and the decoding phase.
    final_pos = mm_features[-1].mm_position
    if final_pos.offset + final_pos.length < start_token_idx:
        return extra_keys, start_mm_idx

    # Translate a negative index into the equivalent non-negative one.
    if start_mm_idx < 0:
        assert -start_mm_idx <= num_mm_features
        start_mm_idx = num_mm_features + start_mm_idx

    curr_mm_idx = start_mm_idx
    while curr_mm_idx < num_mm_features:
        mm_feature = mm_features[curr_mm_idx]
        assert mm_feature.identifier is not None
        mm_start = mm_feature.mm_position.offset
        mm_end = mm_start + mm_feature.mm_position.length

        if end_token_idx <= mm_start:
            # The block ends before this mm input starts.
            break

        if start_token_idx > mm_end:
            # The block lies past this mm input; look at the next one.
            curr_mm_idx += 1
            continue

        # The block contains (part of) this mm input.
        extra_keys.append(mm_feature.identifier)

        if end_token_idx < mm_end:
            # The mm input continues beyond this block, so no later input
            # can be part of the block.
            break

        # The block reaches the end of this mm input and may therefore also
        # contain the next one.
        curr_mm_idx += 1
    return extra_keys, curr_mm_idx
448
+
449
+
450
def _gen_lora_extra_hash_keys(request: Request) -> list[str]:
    """Collect the LoRA-related extra keys for block hash computation.

    Args:
        request: The request object.

    Returns:
        A one-element list with the LoRA name if the request carries a LoRA
        request, otherwise an empty list.
    """
    lora_request = request.lora_request
    return [lora_request.lora_name] if lora_request else []
463
+
464
+
465
def _gen_prompt_embeds_extra_hash_keys(
    request: Request, start_token_idx: int, end_token_idx: int
) -> list[bytes]:
    """Collect the prompt-embedding extra keys for block hash computation.

    Args:
        request: The request object.
        start_token_idx: The start token index of the block.
        end_token_idx: The end token index of the block.

    Returns:
        A one-element list holding the bytes of the prompt embeddings sliced
        to `[start_token_idx:end_token_idx]` if the request has prompt
        embeds, otherwise an empty list.
    """
    prompt_embeds = request.prompt_embeds
    if prompt_embeds is None:
        return []
    block_slice = prompt_embeds[start_token_idx:end_token_idx]
    return [tensor_data(block_slice).tobytes()]
484
+
485
+
486
def generate_block_hash_extra_keys(
    request: Request, start_token_idx: int, end_token_idx: int, start_mm_idx: int
) -> tuple[tuple[Any, ...] | None, int]:
    """Assemble every extra key that goes into one block's hash.

    The keys are concatenated in this order: LoRA name, multi-modal
    identifiers, cache salt (first block only, i.e. `start_token_idx == 0`),
    and prompt-embedding bytes.

    Args:
        request: The request object.
        start_token_idx: The start token index of the block.
        end_token_idx: The end token index of the block.
        start_mm_idx: The start multi-modal index of the block.

    Returns:
        A tuple of extra keys (None when there are none) and the next
        multi-modal index.
    """
    mm_keys: list[Any]
    mm_keys, next_mm_idx = _gen_mm_extra_hash_keys(
        request, start_token_idx, end_token_idx, start_mm_idx
    )
    lora_keys: list[str] = _gen_lora_extra_hash_keys(request)

    salt_keys: list[str] = []
    if start_token_idx == 0 and request.cache_salt:
        salt_keys.append(request.cache_salt)

    embeds_keys = _gen_prompt_embeds_extra_hash_keys(
        request, start_token_idx, end_token_idx
    )

    combined: list[Any] = [*lora_keys, *mm_keys, *salt_keys, *embeds_keys]
    if combined:
        return tuple(combined), next_mm_idx
    return None, next_mm_idx
522
+
523
+
524
def hash_block_tokens(
    hash_function: Callable[[Any], bytes],
    parent_block_hash: BlockHash | None,
    curr_block_token_ids: Sequence[int],
    extra_keys: tuple[Any, ...] | None = None,
) -> BlockHash:
    """Compute the prefix-caching hash of a block.

    The hash covers the parent block's hash, the token ids of the current
    block and the extra keys, so it depends on the block's contents and on
    the blocks that precede it.

    Args:
        hash_function: The hash function used to compute block hash.
        parent_block_hash: The hash of the parent block. A falsy value
            (None for the first block) is replaced by `NONE_HASH`.
        curr_block_token_ids: A list of token ids in the current
            block. The current block is assumed to be full.
        extra_keys: Extra keys for the block.
    Returns:
        The hash value of the block.
    """
    parent = parent_block_hash or NONE_HASH
    payload = (parent, tuple(curr_block_token_ids), extra_keys)
    return BlockHash(hash_function(payload))
552
+
553
+
554
def get_request_block_hasher(
    block_size: int,
    caching_hash_fn: Callable[[Any], bytes],
) -> Callable[[Request], list[BlockHash]]:
    """Create a function that hashes the not-yet-hashed full blocks of a
    request.

    Args:
        block_size: Number of tokens per block.
        caching_hash_fn: The hash function used for each block.

    Returns:
        A callable mapping a request to the list of hashes for its full
        blocks that are not yet present in `request.block_hashes`.
    """

    def request_block_hasher(request: Request) -> list[BlockHash]:
        start_token_idx = len(request.block_hashes) * block_size
        num_tokens = request.num_tokens

        if start_token_idx + block_size > num_tokens:
            # No new full block has been completed yet.
            return []

        # When some blocks were hashed before, this call is reached because a
        # block was completed with generated tokens, so only the last mm
        # input has to be considered; -1 denotes that last mm input.
        curr_mm_idx = -1 if start_token_idx > 0 else 0

        known_hashes = request.block_hashes
        prev_block_hash_value = known_hashes[-1] if known_hashes else None

        new_block_hashes: list[BlockHash] = []
        end_token_idx = start_token_idx + block_size
        # Only full blocks are hashed.
        while end_token_idx <= num_tokens:
            # MM and LoRA requests need extra keys for block-hash computation.
            extra_keys, curr_mm_idx = generate_block_hash_extra_keys(
                request, start_token_idx, end_token_idx, curr_mm_idx
            )

            # Each hash chains onto the hash of the preceding block.
            block_tokens = request.all_token_ids[start_token_idx:end_token_idx]
            prev_block_hash_value = hash_block_tokens(
                caching_hash_fn, prev_block_hash_value, block_tokens, extra_keys
            )
            new_block_hashes.append(prev_block_hash_value)

            start_token_idx = end_token_idx
            end_token_idx += block_size

        return new_block_hashes

    return request_block_hasher
606
+
607
+
608
def max_memory_usage_bytes(
    vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
) -> int:
    """
    Add up `max_memory_usage_bytes(vllm_config)` over the given KV cache specs.
    """
    total_bytes = 0
    for spec in kv_cache_specs:
        total_bytes += spec.max_memory_usage_bytes(vllm_config)
    return total_bytes
615
+
616
+
617
def estimate_max_model_len(
    vllm_config: VllmConfig,
    kv_cache_spec: dict[str, KVCacheSpec],
    available_memory: int,
) -> int:
    """
    Estimates the maximum model length that can fit in the available memory
    using binary search.

    The search temporarily writes candidate lengths into
    `vllm_config.model_config.max_model_len`; the original value is restored
    before returning, so the caller's config is left unchanged.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of each attention layer in the model
        available_memory: Memory available for KV cache in bytes.

    Returns:
        The estimated maximum model length that can fit in the available
        memory, or 0 if even a model length of 1 does not fit.
    """
    original_max_model_len = vllm_config.model_config.max_model_len

    def fits_in_memory(model_len: int) -> bool:
        # The memory estimate reads max_model_len from the config, so the
        # candidate length has to be written there for the calculation.
        vllm_config.model_config.max_model_len = model_len
        memory_needed = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
        return memory_needed <= available_memory

    try:
        left, right = 1, original_max_model_len

        # If even the smallest model length doesn't fit, return 0
        if not fits_in_memory(left):
            return 0

        # Binary search for the maximum model length that fits
        result = 1
        while left <= right:
            mid = (left + right) // 2
            if fits_in_memory(mid):
                result = mid
                left = mid + 1
            else:
                right = mid - 1
        return result
    finally:
        # Undo the probing so the shared config keeps the configured value
        # instead of the last candidate that was tried.
        vllm_config.model_config.max_model_len = original_max_model_len
661
+
662
+
663
def check_enough_kv_cache_memory(
    vllm_config: VllmConfig,
    kv_cache_spec: dict[str, KVCacheSpec],
    available_memory: int,
):
    """
    Checks whether `available_memory` is enough for the KV cache to hold at
    least one request with the model's max_model_len.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of each attention layer in the model
        available_memory: Memory available for KV cache in bytes.

    Raises:
        ValueError: If there is not enough memory available for the KV cache.
    """

    # No need to check for available memory if the kv_cache_spec is empty
    if not kv_cache_spec:
        return

    if available_memory <= 0:
        raise ValueError(
            "No available memory for the cache blocks. "
            "Try increasing `gpu_memory_utilization` when "
            "initializing the engine."
        )

    max_model_len = vllm_config.model_config.max_model_len
    needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())

    if needed_memory > available_memory:
        # Estimate the maximum model length that can fit in the available memory
        try:
            estimated_max_len = estimate_max_model_len(
                vllm_config, kv_cache_spec, available_memory
            )
        finally:
            # The estimation probes candidate lengths by writing them into
            # the config; make sure the configured value is in place when
            # the error below propagates.
            vllm_config.model_config.max_model_len = max_model_len

        # Carries its own trailing space so that no double space is left in
        # the message when there is no estimate.
        estimated_msg = ""
        if estimated_max_len > 0:
            estimated_msg = (
                "Based on the available memory, "
                f"the estimated maximum model length is {estimated_max_len}. "
            )

        raise ValueError(
            f"To serve at least one request with the model's max seq len "
            f"({max_model_len}), {needed_memory / GiB_bytes:.2f} GiB KV "
            f"cache is needed, which is larger than the available KV cache "
            f"memory ({available_memory / GiB_bytes:.2f} GiB). "
            f"{estimated_msg}"
            f"Try increasing `gpu_memory_utilization` or decreasing "
            f"`max_model_len` when initializing the engine."
        )
716
+
717
+
718
def create_kv_cache_group_specs(
    kv_cache_spec: dict[str, KVCacheSpec], grouped_layer_names: list[list[str]]
) -> list[KVCacheGroupSpec]:
    """
    Build one KVCacheGroupSpec per group of layers.
    The specs of the layers in a group are combined with `merge`, so the
    layers of one group are expected to share the same KVCacheSpec.

    Args:
        kv_cache_spec:
            A mapping from each layer name to its corresponding KVCacheSpec.
        grouped_layer_names:
            A list of kv cache groups, where each element is a list of layer
            names that belong to the same group and should share the same
            KVCacheSpec.
    Returns:
        A list of KVCacheGroupSpec objects, one for each group.
    """
    group_specs = []
    for group_layer_names in grouped_layer_names:
        specs_in_group = [kv_cache_spec[name] for name in group_layer_names]
        merged_spec = specs_in_group[0].merge(specs_in_group)
        group_specs.append(KVCacheGroupSpec(group_layer_names, merged_spec))
    return group_specs
746
+
747
+
748
def is_kv_cache_spec_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
    """
    Whether all layers in the given KVCacheSpec have the same KV cache spec.
    Uniformity is decided by trying to `merge` all specs: a merge that raises
    AssertionError means the specs are not uniform.

    Args:
        kv_cache_spec: The kv cache spec of each attention layer in the model

    Returns:
        True if all layers have the same type, False otherwise.
    """
    if not kv_cache_spec:
        # Encoder-only models do not have KV cache, kv_cache_type can be
        # regarded as uniform.
        return True

    try:
        all_specs = list(kv_cache_spec.values())
        all_specs[0].merge(all_specs)
    except AssertionError:
        return False
    else:
        return True
771
+
772
+
773
def get_max_concurrency_for_kv_cache_config(
    vllm_config: VllmConfig, kv_cache_config: KVCacheConfig
) -> float:
    """
    Get the maximum concurrency for the given KV cache configuration.

    The result is `num_blocks` divided by the number of blocks one request
    needs at its maximum memory usage.
    """
    groups = kv_cache_config.kv_cache_groups
    num_layer_per_group = max(len(group.layer_names) for group in groups)

    # Maximum memory one request can take, summed over the group specs.
    bytes_per_request = num_layer_per_group * max_memory_usage_bytes(
        vllm_config, (group.kv_cache_spec for group in groups)
    )
    # Memory of one block, using the page size of the first group's spec.
    bytes_per_block = groups[0].kv_cache_spec.page_size_bytes * num_layer_per_group

    blocks_per_request = cdiv(bytes_per_request, bytes_per_block)
    return kv_cache_config.num_blocks / blocks_per_request
792
+
793
+
794
def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int:
    """
    Return `num_gpu_blocks_override` from the cache config when it is set
    (logging the override), otherwise return `num_blocks` unchanged.
    """
    override = vllm_config.cache_config.num_gpu_blocks_override
    if override is None:
        return num_blocks

    logger.info(
        "Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%d",
        num_blocks,
        override,
    )
    return override
808
+
809
+
810
def get_num_blocks(
    vllm_config: VllmConfig, num_layers: int, available_memory: int, page_size: int
) -> int:
    """
    Get the number of kv cache blocks.

    The count is the available memory floor-divided by the page size and by
    the number of layers, clamped to be non-negative, and then possibly
    replaced by `num_gpu_blocks_override`.

    Args:
        vllm_config: The global VllmConfig
        num_layers: The number of layers
        available_memory: Memory available for KV cache in bytes.
        page_size: The page size of the KV cache.
    """
    blocks_that_fit = int(available_memory // page_size // num_layers)
    if blocks_that_fit < 0:
        blocks_that_fit = 0
    return may_override_num_blocks(vllm_config, blocks_that_fit)
826
+
827
+
828
def get_uniform_page_size(kv_cache_specs: Iterable[KVCacheSpec]) -> int:
    """
    Return the single page size shared by all given KV cache specs.
    Asserts that exactly one distinct `page_size_bytes` value exists.
    """
    distinct_page_sizes = set()
    for spec in kv_cache_specs:
        distinct_page_sizes.add(spec.page_size_bytes)
    assert len(distinct_page_sizes) == 1
    (page_size,) = distinct_page_sizes
    return page_size
835
+
836
+
837
def _get_kv_cache_groups_uniform_spec(
    kv_cache_specs: dict[str, KVCacheSpec],
) -> list[KVCacheGroupSpec]:
    """
    Generates the KV cache groups for a model whose layers all use the same
    KV cache spec: every layer is placed into one single group.

    Args:
        kv_cache_specs: The kv cache spec of each attention layer in the model

    Returns:
        The generated KVCacheGroupSpecs
    """
    all_layer_names = list(kv_cache_specs)
    return create_kv_cache_group_specs(kv_cache_specs, [all_layer_names])
852
+
853
+
854
def _get_kv_cache_groups_uniform_type(
    spec: UniformTypeKVCacheSpecs,
) -> list[KVCacheGroupSpec]:
    """
    Generates the KV cache groups for a model with one type of KV cache
    but different hidden sizes. All layers end up in a single group that
    uses `spec` itself as the group's KV cache spec.

    Args:
        spec: The UniformTypeKVCacheSpecs of the model

    Returns:
        The generated KVCacheGroupSpecs
    """
    layer_names = list(spec.kv_cache_specs.keys())
    single_group = KVCacheGroupSpec(layer_names, spec)
    return [single_group]
869
+
870
+
871
def is_kv_cache_page_size_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
    """
    Whether all layers in the given KVCacheSpec have the same page size.
    Args:
        kv_cache_spec: The KVCacheSpec of each attention layer in the model

    Returns:
        True if exactly one distinct page size exists, False otherwise.
    """
    distinct_page_sizes = set()
    for spec in kv_cache_spec.values():
        distinct_page_sizes.add(spec.page_size_bytes)
    return len(distinct_page_sizes) == 1
883
+
884
+
885
def unify_kv_cache_spec_page_size(
    kv_cache_spec: dict[str, KVCacheSpec],
) -> dict[str, KVCacheSpec]:
    """
    Unify the page size of the given KVCacheSpec. If the page size of all layers
    is the same, return the original KVCacheSpec. Otherwise, unify the page
    size by increasing the block size of the layers with a smaller page size
    until they match the largest page size.

    Args:
        kv_cache_spec: The KVCacheSpec of each attention layer in the model

    Returns:
        The updated KVCacheSpec with the same page_size_bytes. The input
        mapping itself is returned when no change is needed; otherwise a new
        mapping is returned and the input is left untouched.

    Raises:
        NotImplementedError: If the largest page size is not an integer
            multiple of some layer's page size, so scaling that layer's
            block_size cannot reach it.
    """
    page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
    if len(page_sizes) <= 1:
        # All layers have the same page size, no need to unify.
        return kv_cache_spec

    max_page_size = max(page_sizes)
    new_kv_cache_spec = {}
    for layer_name, layer_spec in kv_cache_spec.items():
        layer_page_size = layer_spec.page_size_bytes
        if layer_page_size == max_page_size:
            new_kv_cache_spec[layer_name] = layer_spec
            continue

        if max_page_size % layer_page_size != 0:
            # The check is "max % layer", so the message must say that the
            # maximum is not a multiple of the layer's page size.
            raise NotImplementedError(
                f"The maximum page size ({max_page_size}) is not divisible by "
                f"the page size ({layer_page_size}) of layer {layer_name}. "
                "Cannot unify by adjusting block_size."
            )

        # Scale the block size by the integer ratio between the two page sizes.
        ratio = max_page_size // layer_page_size
        new_spec = replace(layer_spec, block_size=layer_spec.block_size * ratio)
        # Sanity check: the page size is expected to scale with block_size.
        assert new_spec.page_size_bytes == max_page_size
        new_kv_cache_spec[layer_name] = new_spec
    return new_kv_cache_spec
923
+
924
+
925
def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
    """
    Tell whether the model is attention free.

    Attention free models are represented by an empty `kv_cache_spec` dict,
    so the answer is simply whether the mapping is empty.
    """
    has_any_layer = bool(kv_cache_spec)
    return not has_any_layer
928
+
929
+
930
def _get_kv_cache_groups_uniform_page_size(
    kv_cache_spec: dict[str, KVCacheSpec],
) -> list[KVCacheGroupSpec]:
    """
    Generates the KV cache groups for hybrid models that mix several attention
    types while every layer still has the same page size (physical memory per
    block per layer).

    Background on KV cache management for hybrid models:
    The layers of such a model follow a repeating pattern. For instance, a
    model with 10 full attention layers and 20 sliding window attention layers
    can be seen as the pattern (1 * full, 2 * sw) repeated 10 times.
    The KVCacheManager allocates one block table per layer of the pattern (3
    here) and repeats each of them 10 times to obtain the block tables of all
    30 layers. Hence the layers are split into 3 kv_cache_groups of 10 layers
    each. The KVCacheManager allocates the block_table of a group according to
    the group's kv_cache spec, and the model runner applies that block table
    to every layer in the group.
    Examples:
    1. A model that only uses full attention. The pattern is
    (num_hidden_layers * full), so there is a single group whose block table
    is shared by all layers. That case is handled before this function is
    reached (see `_get_kv_cache_groups_uniform_spec`).
    2. A model with 10 full attention layers and 20 sliding window attention
    layers. The pattern (1 * full, 2 * sw) has 3 layers, so there are 3
    kv_cache_groups, each of which represents 10 layers.

    Simplifying assumptions:
    1. Physical memory per block: Must be identical across all KV cache groups.
    Relaxing this is non-trivial because blocks of different sizes raise
    memory fragmentation concerns.
    2. Tokens per block (block_size): Currently `CacheConfig.block_size` is
    used directly for all layers. It could vary per KV cache group, but all
    layers inside one group must share the same block size.
    3. Physical memory per token per layer: Decided by the model config.
    Currently only models where this is identical for all layers are
    supported. Can be relaxed with a simple extension, as long as physical
    memory per block stays the same for all groups.
    4. Number of layers per group: Currently assumed to be the same for all
    groups. Can be relaxed with a simple extension, as long as physical memory
    per block stays the same for all groups.
    5. Attention type within groups: All layers in a group must share the same
    attention type. One exception: when `--disable-hybrid-kv-cache-manager` is
    true, the single group for full attention layers may also include
    attention layers using sliding window or LLaMA 4 local attention. See
    `unify_hybrid_kv_cache_specs` for more details.
    6. Support for multiple attention types: Most components are designed for
    an arbitrary number of attention types, but `find_longest_cache_hit` only
    supports one attention type, or full attention plus exactly one other
    type. A general implementation is feasible but no clean design is known
    yet.

    Because tokens per block, physical memory per token per layer, and the
    number of layers per group are currently assumed to be the same, physical
    memory per block is the same for all groups.

    Args:
        kv_cache_spec: The KVCacheSpec of each attention layer in the model
    Returns:
        The generated KVCacheGroupSpecs
    """
    # Bucket the layer names by their kv_cache_spec.
    # E.g., 2 full attention layers and 3 sliding window attention layers,
    # -> (full.0, full.1), (sw.0, sw.1, sw.2).
    layers_by_spec: dict[KVCacheSpec, list[str]] = defaultdict(list)
    for name, spec in kv_cache_spec.items():
        layers_by_spec[spec].append(name)

    # Each bucket is split into smaller groups so that all groups hold the same
    # number of layers; the last group of a bucket is padded when needed.
    # E.g., (full.0, full.1), (sw.0, sw.1, sw.2)
    # split to 3 groups with 2 layers each:
    # (full.0, full.1), (sw.0, sw.2), (sw.1, padding).
    # FIXME(Chen): At the moment of writing this code (2025-06-02), all
    # open-source hybrid model follows a n:1 pattern between different attention
    # types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and
    # full), so we can use the "1" in the n:1 pattern as the group size, which
    # is the minimum number of layers among all attention types. Need a better
    # strategy if we want to support more complex patterns (e.g., 20 full + 30
    # sw, where the group size should be 10).
    layer_counts = [len(names) for names in layers_by_spec.values()]
    fewest_layers = min(layer_counts)
    most_layers = max(layer_counts)
    if most_layers < fewest_layers * 1.25:
        # The buckets are of similar size, so the largest bucket is used as the
        # group size to avoid too many padding layers. A typical example is
        # gpt-oss-20b + eagle, with 12 sw + 13 full: it is padded to
        # (13 sw, 13 full) instead of (12 sw, 24 full). 1.25 is just a magic
        # number to avoid too many padding layers.
        group_size = most_layers
    else:
        group_size = fewest_layers

    grouped_layers = []
    for names in layers_by_spec.values():
        leftover = len(names) % group_size
        if leftover != 0:
            num_padding_layers = group_size - leftover
            logger.warning(
                "Add %d padding layers, may waste at most %.2f%% KV cache memory",  # noqa
                num_padding_layers,
                num_padding_layers / len(names) * 100,
            )
        num_groups = cdiv(len(names), group_size)
        # In PP case, say if we have
        # - stage 0: full.0, sw.0, sw.1
        # - stage 1: full.1, sw.2, sw.3
        # We should have 3 groups: (full.0, full.1), (sw.0, sw.2), (sw.1, sw.3)
        # It can't be (full.0, full.1), (sw.0, sw.1), (sw.2, sw.3) because
        # the 3 groups in stage 0 will be (full.0), (sw.0, sw.1), (empty group)
        # and it will be padded to (full.0, padding), (sw.0, sw.1),
        # (padding, padding) to ensure the number of layers in each group is
        # the same and will cause memory waste.
        # Therefore the i-th group takes names[i::num_groups] (a strided pick)
        # rather than the contiguous names[i * group_size: (i + 1) * group_size].
        grouped_layers.extend(names[i::num_groups] for i in range(num_groups))
    return create_kv_cache_group_specs(kv_cache_spec, grouped_layers)
1048
+
1049
+
1050
def get_kv_cache_config_from_groups(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
    available_memory: int,
) -> KVCacheConfig:
    """
    Build the KVCacheConfig (block count and tensor layout) for the given KV
    cache groups.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_groups: The KV cache groups
        available_memory: Memory available for KV cache in bytes
    Returns:
        The generated KVCacheConfig
    """
    if not kv_cache_groups:
        # Attention free models do not have KV cache.
        # num_blocks=1 is returned because BlockPool always needs a null_block.
        return KVCacheConfig(
            num_blocks=1,
            kv_cache_tensors=[],
            kv_cache_groups=kv_cache_groups,
        )

    # Decide how model runners should initialize the KV cache tensors.
    sole_spec = kv_cache_groups[0].kv_cache_spec if len(kv_cache_groups) == 1 else None
    if isinstance(sole_spec, UniformTypeKVCacheSpecs):
        # Special case: all layers have the same type of KV cache but with
        # different hidden size. Each layer gets its own tensor, sized from
        # that layer's own page size.
        num_blocks = may_override_num_blocks(
            vllm_config, available_memory // sole_spec.page_size_bytes
        )
        per_layer_specs = sole_spec.kv_cache_specs
        kv_cache_tensors = [
            KVCacheTensor(
                size=per_layer_specs[layer_name].page_size_bytes * num_blocks,
                shared_by=[layer_name],
            )
            for layer_name in kv_cache_groups[0].layer_names
        ]
    else:
        # General case:
        # There are group_size memory pools, each shared by one layer from
        # each group. As layers of different groups have different block table,
        # they will use different parts of the shared Tensor.
        # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2),
        # (sw.1, padding) will be: (group_size = 2)
        # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2
        # full.1, sw.2: share another Tensor with size=available_memory//2
        group_size = max(len(group.layer_names) for group in kv_cache_groups)

        page_size = get_uniform_page_size(
            [group.kv_cache_spec for group in kv_cache_groups]
        )
        assert group_size > 0, "group_size must be greater than 0"
        num_blocks = get_num_blocks(
            vllm_config, group_size, available_memory, page_size
        )
        tensor_size = page_size * num_blocks
        # The i-th tensor is shared by the i-th layer of every group that has
        # at least i + 1 layers.
        kv_cache_tensors = [
            KVCacheTensor(
                size=tensor_size,
                shared_by=[
                    group.layer_names[i]
                    for group in kv_cache_groups
                    if i < len(group.layer_names)
                ],
            )
            for i in range(group_size)
        ]

    return KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=kv_cache_tensors,
        kv_cache_groups=kv_cache_groups,
    )
1127
+
1128
+
1129
def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
    """
    Try to convert the KV cache specs of a hybrid model (one with multiple
    types of KV cache) to a single type, modifying `kv_cache_spec` in place.

    Nothing happens when the specs are already uniform. Otherwise, if
    FullAttentionSpec layers coexist with SlidingWindowSpec and/or
    ChunkedLocalAttentionSpec layers, each of the latter is replaced by a
    FullAttentionSpec that keeps its block_size, num_kv_heads, head_size and
    dtype plus its sliding_window or attention_chunk_size.

    Args:
        kv_cache_spec: The kv cache spec of each attention layer in the model

    Raises:
        ValueError: If the specs are still not of one unified type after the
            conversion attempt.
    """

    def _is_unified() -> bool:
        return is_kv_cache_spec_uniform(
            kv_cache_spec
        ) or UniformTypeKVCacheSpecs.is_uniform_type(kv_cache_spec)

    if _is_unified():
        return

    logger.warning(
        "Hybrid KV cache manager is disabled for this hybrid model, "
        "This means we do not enable any optimizations for saving KV cache "
        "memory (e.g., dropping the KV cache outside the sliding window). "
        "The compute of layers like sliding window is still saved."
    )

    specs = kv_cache_spec.values()
    has_full_attention = any(isinstance(spec, FullAttentionSpec) for spec in specs)
    has_local_attention = any(
        isinstance(spec, (SlidingWindowSpec, ChunkedLocalAttentionSpec))
        for spec in specs
    )
    if has_full_attention and has_local_attention:
        # Only values of existing keys are reassigned, so the dict can be
        # updated while it is being iterated.
        for layer_name, spec in kv_cache_spec.items():
            if isinstance(spec, SlidingWindowSpec):
                extra_fields = {"sliding_window": spec.sliding_window}
            elif isinstance(spec, ChunkedLocalAttentionSpec):
                extra_fields = {"attention_chunk_size": spec.attention_chunk_size}
            else:
                continue
            kv_cache_spec[layer_name] = FullAttentionSpec(
                block_size=spec.block_size,
                num_kv_heads=spec.num_kv_heads,
                head_size=spec.head_size,
                dtype=spec.dtype,
                **extra_fields,
            )

    if not _is_unified():
        raise ValueError(
            "Hybrid KV cache manager is disabled but failed to "
            "convert the KV cache specs to one unified type."
        )
1187
+
1188
+
1189
def get_kv_cache_groups(
    vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec]
) -> list[KVCacheGroupSpec]:
    """
    Partition the layers of the model into KV cache groups, picking the
    grouping strategy that matches how uniform the layers' specs are.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of each attention layer in the model

    Returns:
        The generated KVCacheGroups
    """
    if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
        # Modifies kv_cache_spec in place.
        unify_hybrid_kv_cache_specs(kv_cache_spec)

    if is_kv_cache_type_attention_free(kv_cache_spec):
        # An empty list lets the KVCacheManager handle attention free models.
        return []

    if is_kv_cache_spec_uniform(kv_cache_spec):
        # KV cache of all layers are the same, which is true for
        # most models. Allocate the same amount of memory for
        # each layer.
        return _get_kv_cache_groups_uniform_spec(kv_cache_spec)

    uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_spec)
    if uniform_spec:
        # All layers need the same number of token slots (e.g., all layers are
        # full attention, or all layers are sliding window attention with the
        # same window size). Put all layers into one group.
        return _get_kv_cache_groups_uniform_type(uniform_spec)

    # As KVCacheManager can only allocate memory of one size, the page size of
    # the layers has to be unified first. For cases that cannot be unified,
    # this call raises an error.
    unified_spec = unify_kv_cache_spec_page_size(kv_cache_spec)
    # Model contains multiple attention types, but KV cache of all layers
    # have the same physical memory per block per layer. Split the layers
    # into groups with the same number of layers, and thus same total page
    # size.
    return _get_kv_cache_groups_uniform_page_size(unified_spec)
1230
+
1231
+
1232
def generate_scheduler_kv_cache_config(
    kv_cache_configs: list[KVCacheConfig],
) -> KVCacheConfig:
    """
    Generate the KV cache configuration for the scheduler.

    The result is a deep copy of the first worker's config, so the inputs are
    never modified. In the copy, any group whose spec is a
    UniformTypeKVCacheSpecs gets that spec replaced by one of the per-layer
    specs it contains.

    Args:
        kv_cache_configs: The KV cache configs of all workers; they must all
            report the same `num_blocks`.

    Returns:
        The KVCacheConfig to initialize the scheduler with.
    """
    expected_num_blocks = kv_cache_configs[0].num_blocks
    assert all(
        worker_cfg.num_blocks == expected_num_blocks for worker_cfg in kv_cache_configs
    )
    # All workers have the same kv_cache_config except layer names, so an
    # arbitrary one can be used to initialize the scheduler.
    scheduler_cfg = copy.deepcopy(kv_cache_configs[0])
    for group in scheduler_cfg.kv_cache_groups:
        group_spec = group.kv_cache_spec
        if not isinstance(group_spec, UniformTypeKVCacheSpecs):
            continue
        # All layers in the UniformTypeKVCacheSpecs have the same type,
        # so an arbitrary one is enough for the scheduler.
        group.kv_cache_spec = next(iter(group_spec.kv_cache_specs.values()))
    return scheduler_cfg
1252
+
1253
+
1254
def _report_kv_cache_config(
    vllm_config: VllmConfig, kv_cache_config: KVCacheConfig
) -> None:
    """
    Log the resolved KV cache configuration: the KV cache size in tokens and
    the maximum concurrency for requests of `max_model_len` tokens.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_config: The resolved KV cache configuration
    """
    groups = kv_cache_config.kv_cache_groups
    min_block_size = min(group.kv_cache_spec.block_size for group in groups)

    # Token capacity: blocks are split between the groups, and the smallest
    # block size among the groups is used for the estimate.
    num_tokens = kv_cache_config.num_blocks // len(groups) * min_block_size

    parallel_config = vllm_config.parallel_config
    dcp_size = parallel_config.decode_context_parallel_size
    pcp_size = parallel_config.prefill_context_parallel_size
    cp_world_size = pcp_size * dcp_size
    if cp_world_size > 1:
        num_tokens *= cp_world_size
        logger.info(
            "Multiplying the GPU KV cache size by the cp_world_size %d "
            "(pcp_world_size %d * dcp_world_size %d).",
            cp_world_size,
            pcp_size,
            dcp_size,
        )

    logger.info_once("GPU KV cache size: %s tokens", f"{num_tokens:,}", scope="local")

    max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
    max_concurrency = get_max_concurrency_for_kv_cache_config(
        vllm_config, kv_cache_config
    )
    logger.info_once(
        "Maximum concurrency for %s tokens per request: %.2fx",
        max_model_len_str,
        max_concurrency,
        scope="local",
    )
1297
+
1298
+
1299
def get_kv_cache_configs(
    vllm_config: VllmConfig,
    kv_cache_specs: list[dict[str, KVCacheSpec]],
    available_memory: list[int],
) -> list[KVCacheConfig]:
    """
    Generates the KV cache configurations for a model.
    A shared centralized controller serves all workers, so the
    `kv_cache_config` has to be consistent across all workers for the KV cache
    allocation to be applicable to every one of them. Workers may nevertheless
    differ in available memory and in the type of layers they hold (when
    pipeline parallel is enabled). The current implementation deals with these
    differences as follows:
    1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
       the whole model.
    2. Generate the KV cache groups based on the layer ratio of the whole model.
    3. Generate the KV cache configs for each worker based on the KV cache
       grouping strategy. (This is reasonable because the layer ratio of
       different PP stages are similar.)
    4. Change the num_blocks of each worker to the smallest among all workers
       and shrink tensor sizes proportionally to avoid allocating unused memory.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker.
        available_memory: Memory available for KV cache in bytes for each
            worker.

    Returns:
        The generated KVCacheConfigs for each worker.
    """

    # Every worker must have enough memory for its own layers.
    for worker_specs, worker_memory in zip(kv_cache_specs, available_memory):
        check_enough_kv_cache_memory(vllm_config, worker_specs, worker_memory)

    # Merge the KV cache specs of all workers. Different PP stages may have
    # different layer names, and different TP ranks of the same PP stage should
    # have the same KV cache spec.
    merged_kv_cache_specs: dict[str, KVCacheSpec] = {}
    for worker_specs in kv_cache_specs:
        for layer_name, layer_spec in worker_specs.items():
            if layer_name in merged_kv_cache_specs:
                assert merged_kv_cache_specs[layer_name] == layer_spec, (
                    "The KV cache specs for the same layer are different "
                    "across workers. This is not supported yet."
                )
                continue
            merged_kv_cache_specs[layer_name] = layer_spec
    global_kv_cache_groups = get_kv_cache_groups(vllm_config, merged_kv_cache_specs)

    kv_cache_configs: list[KVCacheConfig] = []
    for worker_specs, worker_memory in zip(kv_cache_specs, available_memory):
        # Restrict each global group to the layers this worker actually holds.
        worker_groups: list[KVCacheGroupSpec] = [
            KVCacheGroupSpec(
                [name for name in group.layer_names if name in worker_specs],
                group.kv_cache_spec,
            )
            for group in global_kv_cache_groups
        ]
        num_grouped_layers = sum(len(group.layer_names) for group in worker_groups)
        assert num_grouped_layers == len(worker_specs), (
            "Some layers are not assigned to any group."
        )
        kv_cache_configs.append(
            get_kv_cache_config_from_groups(vllm_config, worker_groups, worker_memory)
        )

    # Change the num_blocks of each rank to the smallest among all ranks.
    # The tensor sizes are shrunk proportionally as well, to avoid
    # allocating unused memory.
    min_num_blocks = min(config.num_blocks for config in kv_cache_configs)
    for config in kv_cache_configs:
        previous_num_blocks = config.num_blocks
        config.num_blocks = min_num_blocks

        # Shrink tensor size proportionally
        for tensor in config.kv_cache_tensors:
            assert tensor.size % previous_num_blocks == 0
            bytes_per_block = tensor.size // previous_num_blocks
            tensor.size = bytes_per_block * min_num_blocks

        if config.kv_cache_groups:
            _report_kv_cache_config(vllm_config, config)

    return kv_cache_configs
1396
+
1397
+
1398
class BlockHashListWithBlockSize:
    """
    Convert block-hash granularity from `hash_block_size` to `target_block_size`.
    Used when KV cache groups have different block sizes: `hash_block_size`
    is the size used to compute the original `block_hashes`; `target_block_size`
    is the group's actual block size.

    Currently, only scaling up by an integer factor is supported (i.e.,
    `target_block_size` is a multiple of `hash_block_size`). Conversion is
    performed lazily on access for efficiency, by concatenating consecutive
    hashes at `hash_block_size` to form each hash at `target_block_size`.
    Trailing hashes that do not fill a complete target block are not exposed.

    Example (`hash_block_size` = 16, `target_block_size` = 32):
    concatenating two 16-size hashes yields one 32-size hash:

    Block hashes with block_size 16:
    | Token Range | 0-15 | 16-31 | 32-47 | 48-63 |
    |-------------|------|-------|-------|-------|
    | Hash        | A    | B     | C     | D     |

    Block hashes with block_size 32:
    | Token Range | 0-31 | 32-63 |
    |-------------|------|-------|
    | Hash        | AB   | CD    |

    Args:
        block_hashes: Block hashes to convert, computed at `hash_block_size`.
        hash_block_size: Block size at which `block_hashes` were computed.
        target_block_size: Desired block size; must be a multiple of `hash_block_size`.
    """

    def __init__(
        self,
        block_hashes: list[BlockHash],
        hash_block_size: int,
        target_block_size: int,
    ):
        # The list is kept by reference (not copied); merging happens on access.
        self.block_hashes = block_hashes
        assert target_block_size % hash_block_size == 0
        # Number of source hashes that make up one target-size hash.
        self.scale_factor = target_block_size // hash_block_size

    def __len__(self) -> int:
        # Only complete target blocks are counted.
        return len(self.block_hashes) // self.scale_factor

    @overload
    def __getitem__(self, idx: int) -> BlockHash: ...

    @overload
    def __getitem__(self, idx: slice) -> list[BlockHash]: ...

    def __getitem__(self, idx):
        """
        Return the merged hash at `idx`, or a list of merged hashes for a slice.

        Raises:
            IndexError: If an integer index is outside `[-len(self), len(self))`.
            TypeError: If `idx` is neither an int nor a slice.
        """
        if isinstance(idx, int):
            num_blocks = len(self)
            # Negative indices are resolved against the number of complete
            # target blocks. Resolving them against the raw hash list would
            # merge misaligned hashes whenever that list has a partial
            # trailing block.
            position = idx + num_blocks if idx < 0 else idx
            if not 0 <= position < num_blocks:
                raise IndexError(
                    f"Block hash index {idx} out of range for length {num_blocks}"
                )
            return self._get_value_at(position)

        if isinstance(idx, slice):
            start, stop, step = idx.indices(len(self))
            return [self._get_value_at(i) for i in range(start, stop, step)]

        raise TypeError(f"Invalid index type: {type(idx)!r}")

    def __iter__(self) -> Iterator[BlockHash]:
        for i in range(len(self)):
            yield self._get_value_at(i)

    def _get_value_at(self, idx: int) -> BlockHash:
        """Concatenate the `scale_factor` source hashes of target block `idx`."""
        base = idx * self.scale_factor
        end = base + self.scale_factor
        # Element-wise indexing (rather than slicing) keeps the IndexError for
        # an out-of-range block; join avoids repeated bytes reallocation.
        merged_hash: bytes = b"".join(self.block_hashes[i] for i in range(base, end))
        return BlockHash(merged_hash)
1469
+
1470
+
1471
# Type alias for a sequence of block hashes: either a plain list of BlockHash
# or a BlockHashListWithBlockSize wrapper around such a list.
BlockHashList = list[BlockHash] | BlockHashListWithBlockSize