vllm_cpu_avx512bf16-0.14.0-cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1712)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +1511 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +3206 -0
  6. vllm/_ipex_ops.py +445 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +62 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/layer.py +913 -0
  15. vllm/attention/utils/__init__.py +0 -0
  16. vllm/attention/utils/kv_sharing_utils.py +33 -0
  17. vllm/attention/utils/kv_transfer_utils.py +60 -0
  18. vllm/beam_search.py +88 -0
  19. vllm/benchmarks/__init__.py +0 -0
  20. vllm/benchmarks/datasets.py +3277 -0
  21. vllm/benchmarks/latency.py +172 -0
  22. vllm/benchmarks/lib/__init__.py +3 -0
  23. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  24. vllm/benchmarks/lib/ready_checker.py +72 -0
  25. vllm/benchmarks/lib/utils.py +79 -0
  26. vllm/benchmarks/mm_processor.py +363 -0
  27. vllm/benchmarks/serve.py +1761 -0
  28. vllm/benchmarks/startup.py +321 -0
  29. vllm/benchmarks/sweep/__init__.py +0 -0
  30. vllm/benchmarks/sweep/cli.py +41 -0
  31. vllm/benchmarks/sweep/param_sweep.py +159 -0
  32. vllm/benchmarks/sweep/plot.py +675 -0
  33. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  34. vllm/benchmarks/sweep/serve.py +450 -0
  35. vllm/benchmarks/sweep/serve_sla.py +459 -0
  36. vllm/benchmarks/sweep/server.py +114 -0
  37. vllm/benchmarks/sweep/sla_sweep.py +138 -0
  38. vllm/benchmarks/sweep/utils.py +4 -0
  39. vllm/benchmarks/throughput.py +946 -0
  40. vllm/collect_env.py +857 -0
  41. vllm/compilation/__init__.py +0 -0
  42. vllm/compilation/activation_quant_fusion.py +214 -0
  43. vllm/compilation/backends.py +840 -0
  44. vllm/compilation/base_static_graph.py +57 -0
  45. vllm/compilation/caching.py +196 -0
  46. vllm/compilation/collective_fusion.py +1224 -0
  47. vllm/compilation/compiler_interface.py +639 -0
  48. vllm/compilation/counter.py +50 -0
  49. vllm/compilation/cuda_graph.py +309 -0
  50. vllm/compilation/decorators.py +662 -0
  51. vllm/compilation/fix_functionalization.py +266 -0
  52. vllm/compilation/fusion.py +570 -0
  53. vllm/compilation/fusion_attn.py +363 -0
  54. vllm/compilation/fx_utils.py +92 -0
  55. vllm/compilation/inductor_pass.py +145 -0
  56. vllm/compilation/matcher_utils.py +454 -0
  57. vllm/compilation/monitor.py +62 -0
  58. vllm/compilation/noop_elimination.py +130 -0
  59. vllm/compilation/partition_rules.py +75 -0
  60. vllm/compilation/pass_manager.py +164 -0
  61. vllm/compilation/piecewise_backend.py +191 -0
  62. vllm/compilation/post_cleanup.py +21 -0
  63. vllm/compilation/qk_norm_rope_fusion.py +244 -0
  64. vllm/compilation/rocm_aiter_fusion.py +401 -0
  65. vllm/compilation/sequence_parallelism.py +368 -0
  66. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  67. vllm/compilation/vllm_inductor_pass.py +180 -0
  68. vllm/compilation/wrapper.py +329 -0
  69. vllm/config/__init__.py +112 -0
  70. vllm/config/attention.py +114 -0
  71. vllm/config/cache.py +233 -0
  72. vllm/config/compilation.py +1149 -0
  73. vllm/config/device.py +75 -0
  74. vllm/config/ec_transfer.py +110 -0
  75. vllm/config/kv_events.py +56 -0
  76. vllm/config/kv_transfer.py +119 -0
  77. vllm/config/load.py +124 -0
  78. vllm/config/lora.py +102 -0
  79. vllm/config/model.py +2026 -0
  80. vllm/config/model_arch.py +57 -0
  81. vllm/config/multimodal.py +247 -0
  82. vllm/config/observability.py +157 -0
  83. vllm/config/parallel.py +703 -0
  84. vllm/config/pooler.py +188 -0
  85. vllm/config/profiler.py +199 -0
  86. vllm/config/scheduler.py +298 -0
  87. vllm/config/speculative.py +656 -0
  88. vllm/config/speech_to_text.py +39 -0
  89. vllm/config/structured_outputs.py +78 -0
  90. vllm/config/utils.py +374 -0
  91. vllm/config/vllm.py +1487 -0
  92. vllm/connections.py +189 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +301 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +43 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +509 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +303 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +346 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +190 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  106. vllm/distributed/device_communicators/pynccl.py +386 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +567 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +778 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  113. vllm/distributed/device_communicators/symm_mem.py +156 -0
  114. vllm/distributed/device_communicators/xpu_communicator.py +98 -0
  115. vllm/distributed/ec_transfer/__init__.py +14 -0
  116. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  117. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  118. vllm/distributed/ec_transfer/ec_connector/example_connector.py +201 -0
  119. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  120. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  121. vllm/distributed/eplb/__init__.py +3 -0
  122. vllm/distributed/eplb/async_worker.py +115 -0
  123. vllm/distributed/eplb/eplb_state.py +1192 -0
  124. vllm/distributed/eplb/policy/__init__.py +19 -0
  125. vllm/distributed/eplb/policy/abstract.py +43 -0
  126. vllm/distributed/eplb/policy/default.py +376 -0
  127. vllm/distributed/eplb/rebalance_execute.py +699 -0
  128. vllm/distributed/kv_events.py +505 -0
  129. vllm/distributed/kv_transfer/README.md +29 -0
  130. vllm/distributed/kv_transfer/__init__.py +20 -0
  131. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  132. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  133. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  134. vllm/distributed/kv_transfer/kv_connector/factory.py +203 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +459 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +607 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +450 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +344 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  142. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +395 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +211 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1431 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +941 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +186 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +916 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/moriio/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py +321 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +1515 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py +609 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +477 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2688 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +557 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  159. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  160. vllm/distributed/parallel_state.py +1809 -0
  161. vllm/distributed/utils.py +545 -0
  162. vllm/engine/__init__.py +0 -0
  163. vllm/engine/arg_utils.py +2137 -0
  164. vllm/engine/async_llm_engine.py +6 -0
  165. vllm/engine/llm_engine.py +6 -0
  166. vllm/engine/protocol.py +194 -0
  167. vllm/entrypoints/__init__.py +0 -0
  168. vllm/entrypoints/anthropic/__init__.py +0 -0
  169. vllm/entrypoints/anthropic/protocol.py +162 -0
  170. vllm/entrypoints/anthropic/serving_messages.py +468 -0
  171. vllm/entrypoints/api_server.py +186 -0
  172. vllm/entrypoints/chat_utils.py +1912 -0
  173. vllm/entrypoints/cli/__init__.py +19 -0
  174. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/base.py +25 -0
  176. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  177. vllm/entrypoints/cli/benchmark/main.py +57 -0
  178. vllm/entrypoints/cli/benchmark/mm_processor.py +21 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  180. vllm/entrypoints/cli/benchmark/startup.py +21 -0
  181. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  182. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  183. vllm/entrypoints/cli/collect_env.py +38 -0
  184. vllm/entrypoints/cli/main.py +79 -0
  185. vllm/entrypoints/cli/openai.py +260 -0
  186. vllm/entrypoints/cli/run_batch.py +68 -0
  187. vllm/entrypoints/cli/serve.py +253 -0
  188. vllm/entrypoints/cli/types.py +29 -0
  189. vllm/entrypoints/constants.py +12 -0
  190. vllm/entrypoints/context.py +898 -0
  191. vllm/entrypoints/grpc_server.py +531 -0
  192. vllm/entrypoints/launcher.py +175 -0
  193. vllm/entrypoints/llm.py +1807 -0
  194. vllm/entrypoints/logger.py +86 -0
  195. vllm/entrypoints/openai/__init__.py +0 -0
  196. vllm/entrypoints/openai/api_server.py +1390 -0
  197. vllm/entrypoints/openai/cli_args.py +320 -0
  198. vllm/entrypoints/openai/orca_metrics.py +120 -0
  199. vllm/entrypoints/openai/parser/__init__.py +0 -0
  200. vllm/entrypoints/openai/parser/harmony_utils.py +820 -0
  201. vllm/entrypoints/openai/parser/responses_parser.py +176 -0
  202. vllm/entrypoints/openai/protocol.py +2566 -0
  203. vllm/entrypoints/openai/run_batch.py +635 -0
  204. vllm/entrypoints/openai/serving_chat.py +1897 -0
  205. vllm/entrypoints/openai/serving_chat_stream_harmony.py +101 -0
  206. vllm/entrypoints/openai/serving_completion.py +740 -0
  207. vllm/entrypoints/openai/serving_engine.py +1612 -0
  208. vllm/entrypoints/openai/serving_models.py +309 -0
  209. vllm/entrypoints/openai/serving_responses.py +2552 -0
  210. vllm/entrypoints/openai/serving_transcription.py +168 -0
  211. vllm/entrypoints/openai/speech_to_text.py +711 -0
  212. vllm/entrypoints/openai/utils.py +49 -0
  213. vllm/entrypoints/pooling/__init__.py +16 -0
  214. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  215. vllm/entrypoints/pooling/classify/api_router.py +48 -0
  216. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  217. vllm/entrypoints/pooling/classify/serving.py +233 -0
  218. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  219. vllm/entrypoints/pooling/embed/api_router.py +65 -0
  220. vllm/entrypoints/pooling/embed/conftest.py +28 -0
  221. vllm/entrypoints/pooling/embed/protocol.py +217 -0
  222. vllm/entrypoints/pooling/embed/serving.py +684 -0
  223. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  224. vllm/entrypoints/pooling/pooling/api_router.py +62 -0
  225. vllm/entrypoints/pooling/pooling/protocol.py +146 -0
  226. vllm/entrypoints/pooling/pooling/serving.py +354 -0
  227. vllm/entrypoints/pooling/score/__init__.py +0 -0
  228. vllm/entrypoints/pooling/score/api_router.py +147 -0
  229. vllm/entrypoints/pooling/score/protocol.py +146 -0
  230. vllm/entrypoints/pooling/score/serving.py +511 -0
  231. vllm/entrypoints/renderer.py +411 -0
  232. vllm/entrypoints/responses_utils.py +218 -0
  233. vllm/entrypoints/sagemaker/__init__.py +4 -0
  234. vllm/entrypoints/sagemaker/routes.py +118 -0
  235. vllm/entrypoints/score_utils.py +271 -0
  236. vllm/entrypoints/serve/__init__.py +94 -0
  237. vllm/entrypoints/serve/cache/__init__.py +0 -0
  238. vllm/entrypoints/serve/cache/api_router.py +61 -0
  239. vllm/entrypoints/serve/disagg/__init__.py +0 -0
  240. vllm/entrypoints/serve/disagg/api_router.py +109 -0
  241. vllm/entrypoints/serve/disagg/protocol.py +90 -0
  242. vllm/entrypoints/serve/disagg/serving.py +285 -0
  243. vllm/entrypoints/serve/elastic_ep/__init__.py +0 -0
  244. vllm/entrypoints/serve/elastic_ep/api_router.py +96 -0
  245. vllm/entrypoints/serve/elastic_ep/middleware.py +49 -0
  246. vllm/entrypoints/serve/instrumentator/__init__.py +0 -0
  247. vllm/entrypoints/serve/instrumentator/health.py +33 -0
  248. vllm/entrypoints/serve/instrumentator/metrics.py +45 -0
  249. vllm/entrypoints/serve/instrumentator/offline_docs.py +50 -0
  250. vllm/entrypoints/serve/instrumentator/server_info.py +56 -0
  251. vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js +2 -0
  252. vllm/entrypoints/serve/instrumentator/static/swagger-ui.css +3 -0
  253. vllm/entrypoints/serve/lora/__init__.py +0 -0
  254. vllm/entrypoints/serve/lora/api_router.py +70 -0
  255. vllm/entrypoints/serve/profile/__init__.py +0 -0
  256. vllm/entrypoints/serve/profile/api_router.py +46 -0
  257. vllm/entrypoints/serve/rlhf/__init__.py +0 -0
  258. vllm/entrypoints/serve/rlhf/api_router.py +102 -0
  259. vllm/entrypoints/serve/rpc/__init__.py +0 -0
  260. vllm/entrypoints/serve/rpc/api_router.py +61 -0
  261. vllm/entrypoints/serve/sleep/__init__.py +0 -0
  262. vllm/entrypoints/serve/sleep/api_router.py +56 -0
  263. vllm/entrypoints/serve/tokenize/__init__.py +0 -0
  264. vllm/entrypoints/serve/tokenize/api_router.py +112 -0
  265. vllm/entrypoints/serve/tokenize/serving.py +204 -0
  266. vllm/entrypoints/ssl.py +78 -0
  267. vllm/entrypoints/tool.py +187 -0
  268. vllm/entrypoints/tool_server.py +234 -0
  269. vllm/entrypoints/utils.py +336 -0
  270. vllm/env_override.py +402 -0
  271. vllm/envs.py +1791 -0
  272. vllm/exceptions.py +36 -0
  273. vllm/forward_context.py +375 -0
  274. vllm/grpc/__init__.py +17 -0
  275. vllm/grpc/compile_protos.py +94 -0
  276. vllm/grpc/vllm_engine.proto +195 -0
  277. vllm/grpc/vllm_engine_pb2.py +77 -0
  278. vllm/grpc/vllm_engine_pb2.pyi +213 -0
  279. vllm/grpc/vllm_engine_pb2_grpc.py +330 -0
  280. vllm/inputs/__init__.py +44 -0
  281. vllm/inputs/data.py +359 -0
  282. vllm/inputs/parse.py +147 -0
  283. vllm/inputs/preprocess.py +716 -0
  284. vllm/logger.py +303 -0
  285. vllm/logging_utils/__init__.py +13 -0
  286. vllm/logging_utils/dump_input.py +83 -0
  287. vllm/logging_utils/formatter.py +127 -0
  288. vllm/logging_utils/lazy.py +20 -0
  289. vllm/logging_utils/log_time.py +34 -0
  290. vllm/logits_process.py +121 -0
  291. vllm/logprobs.py +206 -0
  292. vllm/lora/__init__.py +0 -0
  293. vllm/lora/layers/__init__.py +43 -0
  294. vllm/lora/layers/base.py +66 -0
  295. vllm/lora/layers/base_linear.py +172 -0
  296. vllm/lora/layers/column_parallel_linear.py +577 -0
  297. vllm/lora/layers/fused_moe.py +739 -0
  298. vllm/lora/layers/logits_processor.py +203 -0
  299. vllm/lora/layers/replicated_linear.py +70 -0
  300. vllm/lora/layers/row_parallel_linear.py +176 -0
  301. vllm/lora/layers/utils.py +115 -0
  302. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  303. vllm/lora/lora_model.py +221 -0
  304. vllm/lora/lora_weights.py +227 -0
  305. vllm/lora/model_manager.py +858 -0
  306. vllm/lora/ops/__init__.py +0 -0
  307. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  308. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  309. vllm/lora/ops/torch_ops/__init__.py +20 -0
  310. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  311. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  312. vllm/lora/ops/triton_ops/__init__.py +21 -0
  313. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +677 -0
  314. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  315. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  316. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  317. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  318. vllm/lora/ops/triton_ops/utils.py +313 -0
  319. vllm/lora/peft_helper.py +128 -0
  320. vllm/lora/punica_wrapper/__init__.py +10 -0
  321. vllm/lora/punica_wrapper/punica_base.py +493 -0
  322. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  323. vllm/lora/punica_wrapper/punica_gpu.py +413 -0
  324. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  325. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  326. vllm/lora/punica_wrapper/utils.py +150 -0
  327. vllm/lora/request.py +60 -0
  328. vllm/lora/resolver.py +88 -0
  329. vllm/lora/utils.py +281 -0
  330. vllm/lora/worker_manager.py +278 -0
  331. vllm/model_executor/__init__.py +9 -0
  332. vllm/model_executor/custom_op.py +203 -0
  333. vllm/model_executor/layers/__init__.py +0 -0
  334. vllm/model_executor/layers/activation.py +628 -0
  335. vllm/model_executor/layers/attention/__init__.py +0 -0
  336. vllm/model_executor/layers/attention/chunked_local_attention.py +130 -0
  337. vllm/model_executor/layers/attention/cross_attention.py +182 -0
  338. vllm/model_executor/layers/attention/encoder_only_attention.py +103 -0
  339. vllm/model_executor/layers/attention/mm_encoder_attention.py +234 -0
  340. vllm/model_executor/layers/attention/static_sink_attention.py +254 -0
  341. vllm/model_executor/layers/attention_layer_base.py +34 -0
  342. vllm/model_executor/layers/batch_invariant.py +1063 -0
  343. vllm/model_executor/layers/conv.py +262 -0
  344. vllm/model_executor/layers/fla/__init__.py +8 -0
  345. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  346. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  347. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  348. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  349. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  350. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  351. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  352. vllm/model_executor/layers/fla/ops/index.py +41 -0
  353. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  354. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  355. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  356. vllm/model_executor/layers/fla/ops/op.py +60 -0
  357. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  358. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  359. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  360. vllm/model_executor/layers/fused_moe/__init__.py +120 -0
  361. vllm/model_executor/layers/fused_moe/all2all_utils.py +173 -0
  362. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +411 -0
  363. vllm/model_executor/layers/fused_moe/config.py +1111 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json +147 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  645. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  646. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  647. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  648. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  649. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  650. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  651. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +444 -0
  652. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1086 -0
  653. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +364 -0
  654. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +427 -0
  655. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  656. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +436 -0
  657. vllm/model_executor/layers/fused_moe/fallback.py +127 -0
  658. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +338 -0
  659. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +310 -0
  660. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +371 -0
  661. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  662. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1018 -0
  663. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +824 -0
  664. vllm/model_executor/layers/fused_moe/fused_moe.py +2638 -0
  665. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +119 -0
  666. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +117 -0
  667. vllm/model_executor/layers/fused_moe/fused_moe_router.py +40 -0
  668. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +531 -0
  669. vllm/model_executor/layers/fused_moe/layer.py +2169 -0
  670. vllm/model_executor/layers/fused_moe/modular_kernel.py +1251 -0
  671. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +192 -0
  672. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  673. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  674. vllm/model_executor/layers/fused_moe/oracle/__init__.py +2 -0
  675. vllm/model_executor/layers/fused_moe/oracle/fp8.py +358 -0
  676. vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +280 -0
  677. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  678. vllm/model_executor/layers/fused_moe/prepare_finalize.py +87 -0
  679. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +347 -0
  680. vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +324 -0
  681. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  682. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  683. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  684. vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py +78 -0
  685. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +75 -0
  686. vllm/model_executor/layers/fused_moe/trtllm_moe.py +144 -0
  687. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +403 -0
  688. vllm/model_executor/layers/fused_moe/utils.py +382 -0
  689. vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +189 -0
  690. vllm/model_executor/layers/kda.py +442 -0
  691. vllm/model_executor/layers/layernorm.py +451 -0
  692. vllm/model_executor/layers/lightning_attn.py +735 -0
  693. vllm/model_executor/layers/linear.py +1478 -0
  694. vllm/model_executor/layers/logits_processor.py +109 -0
  695. vllm/model_executor/layers/mamba/__init__.py +0 -0
  696. vllm/model_executor/layers/mamba/abstract.py +68 -0
  697. vllm/model_executor/layers/mamba/linear_attn.py +410 -0
  698. vllm/model_executor/layers/mamba/mamba_mixer.py +541 -0
  699. vllm/model_executor/layers/mamba/mamba_mixer2.py +936 -0
  700. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  701. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  702. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  703. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  704. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +586 -0
  705. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  706. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  707. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  708. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  709. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  710. vllm/model_executor/layers/mamba/short_conv.py +254 -0
  711. vllm/model_executor/layers/mla.py +179 -0
  712. vllm/model_executor/layers/pooler/__init__.py +5 -0
  713. vllm/model_executor/layers/pooler/abstract.py +39 -0
  714. vllm/model_executor/layers/pooler/activations.py +162 -0
  715. vllm/model_executor/layers/pooler/common.py +32 -0
  716. vllm/model_executor/layers/pooler/seqwise/__init__.py +45 -0
  717. vllm/model_executor/layers/pooler/seqwise/heads.py +151 -0
  718. vllm/model_executor/layers/pooler/seqwise/methods.py +93 -0
  719. vllm/model_executor/layers/pooler/seqwise/poolers.py +127 -0
  720. vllm/model_executor/layers/pooler/special.py +128 -0
  721. vllm/model_executor/layers/pooler/tokwise/__init__.py +39 -0
  722. vllm/model_executor/layers/pooler/tokwise/heads.py +133 -0
  723. vllm/model_executor/layers/pooler/tokwise/methods.py +122 -0
  724. vllm/model_executor/layers/pooler/tokwise/poolers.py +127 -0
  725. vllm/model_executor/layers/quantization/__init__.py +195 -0
  726. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  727. vllm/model_executor/layers/quantization/awq.py +277 -0
  728. vllm/model_executor/layers/quantization/awq_marlin.py +795 -0
  729. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  730. vllm/model_executor/layers/quantization/base_config.py +170 -0
  731. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  732. vllm/model_executor/layers/quantization/bitsandbytes.py +631 -0
  733. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  734. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +982 -0
  735. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2368 -0
  736. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +37 -0
  737. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  738. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  739. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  740. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py +106 -0
  741. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  742. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  743. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +176 -0
  744. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  745. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  746. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +203 -0
  747. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  748. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  749. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  750. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  751. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  752. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  753. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  754. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  755. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  756. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  757. vllm/model_executor/layers/quantization/cpu_wna16.py +299 -0
  758. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  759. vllm/model_executor/layers/quantization/experts_int8.py +209 -0
  760. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  761. vllm/model_executor/layers/quantization/fp8.py +1224 -0
  762. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  763. vllm/model_executor/layers/quantization/gguf.py +682 -0
  764. vllm/model_executor/layers/quantization/gptq.py +393 -0
  765. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  766. vllm/model_executor/layers/quantization/gptq_marlin.py +934 -0
  767. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  768. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  769. vllm/model_executor/layers/quantization/inc.py +65 -0
  770. vllm/model_executor/layers/quantization/input_quant_fp8.py +212 -0
  771. vllm/model_executor/layers/quantization/ipex_quant.py +403 -0
  772. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  773. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  774. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +113 -0
  775. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  776. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  777. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  778. vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py +126 -0
  779. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +130 -0
  780. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  781. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +168 -0
  782. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  783. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  784. vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py +97 -0
  785. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +76 -0
  786. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +77 -0
  787. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +128 -0
  788. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +220 -0
  789. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +147 -0
  790. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +88 -0
  791. vllm/model_executor/layers/quantization/kv_cache.py +153 -0
  792. vllm/model_executor/layers/quantization/modelopt.py +1665 -0
  793. vllm/model_executor/layers/quantization/moe_wna16.py +518 -0
  794. vllm/model_executor/layers/quantization/mxfp4.py +1145 -0
  795. vllm/model_executor/layers/quantization/petit.py +319 -0
  796. vllm/model_executor/layers/quantization/ptpc_fp8.py +140 -0
  797. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  798. vllm/model_executor/layers/quantization/quark/quark.py +570 -0
  799. vllm/model_executor/layers/quantization/quark/quark_moe.py +797 -0
  800. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  801. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  802. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  803. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  804. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  805. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  806. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  807. vllm/model_executor/layers/quantization/rtn.py +626 -0
  808. vllm/model_executor/layers/quantization/schema.py +90 -0
  809. vllm/model_executor/layers/quantization/torchao.py +380 -0
  810. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  811. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  812. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1002. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1003. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1004. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1005. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1006. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1007. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1008. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1009. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1010. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1011. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1012. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1013. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1014. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1015. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1016. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1017. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1018. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1019. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1020. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1021. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1022. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1023. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1024. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1025. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1026. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1027. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1028. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +514 -0
  1029. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +370 -0
  1030. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1658 -0
  1031. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1032. vllm/model_executor/layers/quantization/utils/int8_utils.py +477 -0
  1033. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1034. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1035. vllm/model_executor/layers/quantization/utils/marlin_utils.py +720 -0
  1036. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +565 -0
  1037. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1038. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1039. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1040. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +189 -0
  1041. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1042. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1043. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1044. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1045. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1046. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1047. vllm/model_executor/layers/quantization/utils/quant_utils.py +767 -0
  1048. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +519 -0
  1049. vllm/model_executor/layers/resampler.py +283 -0
  1050. vllm/model_executor/layers/rotary_embedding/__init__.py +291 -0
  1051. vllm/model_executor/layers/rotary_embedding/base.py +282 -0
  1052. vllm/model_executor/layers/rotary_embedding/common.py +289 -0
  1053. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +184 -0
  1054. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +218 -0
  1055. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1056. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1057. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +82 -0
  1058. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1059. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1060. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +83 -0
  1061. vllm/model_executor/layers/rotary_embedding/mrope.py +412 -0
  1062. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1063. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1064. vllm/model_executor/layers/rotary_embedding/xdrope.py +160 -0
  1065. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1066. vllm/model_executor/layers/utils.py +251 -0
  1067. vllm/model_executor/layers/vocab_parallel_embedding.py +564 -0
  1068. vllm/model_executor/model_loader/__init__.py +150 -0
  1069. vllm/model_executor/model_loader/base_loader.py +71 -0
  1070. vllm/model_executor/model_loader/bitsandbytes_loader.py +821 -0
  1071. vllm/model_executor/model_loader/default_loader.py +304 -0
  1072. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1073. vllm/model_executor/model_loader/gguf_loader.py +371 -0
  1074. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1075. vllm/model_executor/model_loader/runai_streamer_loader.py +115 -0
  1076. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1077. vllm/model_executor/model_loader/tensorizer.py +793 -0
  1078. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1079. vllm/model_executor/model_loader/utils.py +299 -0
  1080. vllm/model_executor/model_loader/weight_utils.py +1183 -0
  1081. vllm/model_executor/models/__init__.py +44 -0
  1082. vllm/model_executor/models/adapters.py +592 -0
  1083. vllm/model_executor/models/afmoe.py +697 -0
  1084. vllm/model_executor/models/aimv2.py +248 -0
  1085. vllm/model_executor/models/apertus.py +567 -0
  1086. vllm/model_executor/models/arcee.py +428 -0
  1087. vllm/model_executor/models/arctic.py +633 -0
  1088. vllm/model_executor/models/aria.py +663 -0
  1089. vllm/model_executor/models/audioflamingo3.py +639 -0
  1090. vllm/model_executor/models/aya_vision.py +448 -0
  1091. vllm/model_executor/models/bagel.py +591 -0
  1092. vllm/model_executor/models/baichuan.py +493 -0
  1093. vllm/model_executor/models/bailing_moe.py +643 -0
  1094. vllm/model_executor/models/bamba.py +511 -0
  1095. vllm/model_executor/models/bee.py +157 -0
  1096. vllm/model_executor/models/bert.py +911 -0
  1097. vllm/model_executor/models/bert_with_rope.py +729 -0
  1098. vllm/model_executor/models/blip.py +350 -0
  1099. vllm/model_executor/models/blip2.py +736 -0
  1100. vllm/model_executor/models/bloom.py +390 -0
  1101. vllm/model_executor/models/chameleon.py +1095 -0
  1102. vllm/model_executor/models/chatglm.py +502 -0
  1103. vllm/model_executor/models/clip.py +1045 -0
  1104. vllm/model_executor/models/cohere2_vision.py +470 -0
  1105. vllm/model_executor/models/commandr.py +469 -0
  1106. vllm/model_executor/models/config.py +571 -0
  1107. vllm/model_executor/models/dbrx.py +484 -0
  1108. vllm/model_executor/models/deepencoder.py +679 -0
  1109. vllm/model_executor/models/deepseek_eagle.py +253 -0
  1110. vllm/model_executor/models/deepseek_mtp.py +447 -0
  1111. vllm/model_executor/models/deepseek_ocr.py +601 -0
  1112. vllm/model_executor/models/deepseek_v2.py +1727 -0
  1113. vllm/model_executor/models/deepseek_vl2.py +642 -0
  1114. vllm/model_executor/models/dots1.py +566 -0
  1115. vllm/model_executor/models/dots_ocr.py +830 -0
  1116. vllm/model_executor/models/ernie45.py +53 -0
  1117. vllm/model_executor/models/ernie45_moe.py +755 -0
  1118. vllm/model_executor/models/ernie45_vl.py +1702 -0
  1119. vllm/model_executor/models/ernie45_vl_moe.py +801 -0
  1120. vllm/model_executor/models/ernie_mtp.py +278 -0
  1121. vllm/model_executor/models/exaone.py +524 -0
  1122. vllm/model_executor/models/exaone4.py +518 -0
  1123. vllm/model_executor/models/exaone_moe.py +579 -0
  1124. vllm/model_executor/models/exaone_moe_mtp.py +255 -0
  1125. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1126. vllm/model_executor/models/falcon.py +543 -0
  1127. vllm/model_executor/models/falcon_h1.py +675 -0
  1128. vllm/model_executor/models/flex_olmo.py +155 -0
  1129. vllm/model_executor/models/fuyu.py +371 -0
  1130. vllm/model_executor/models/gemma.py +425 -0
  1131. vllm/model_executor/models/gemma2.py +435 -0
  1132. vllm/model_executor/models/gemma3.py +520 -0
  1133. vllm/model_executor/models/gemma3_mm.py +664 -0
  1134. vllm/model_executor/models/gemma3n.py +1166 -0
  1135. vllm/model_executor/models/gemma3n_audio_utils.py +57 -0
  1136. vllm/model_executor/models/gemma3n_mm.py +820 -0
  1137. vllm/model_executor/models/glm.py +24 -0
  1138. vllm/model_executor/models/glm4.py +295 -0
  1139. vllm/model_executor/models/glm4_1v.py +1823 -0
  1140. vllm/model_executor/models/glm4_moe.py +725 -0
  1141. vllm/model_executor/models/glm4_moe_mtp.py +365 -0
  1142. vllm/model_executor/models/glm4v.py +783 -0
  1143. vllm/model_executor/models/glmasr.py +1154 -0
  1144. vllm/model_executor/models/glmasr_utils.py +188 -0
  1145. vllm/model_executor/models/gpt2.py +385 -0
  1146. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1147. vllm/model_executor/models/gpt_j.py +346 -0
  1148. vllm/model_executor/models/gpt_neox.py +340 -0
  1149. vllm/model_executor/models/gpt_oss.py +745 -0
  1150. vllm/model_executor/models/granite.py +475 -0
  1151. vllm/model_executor/models/granite_speech.py +919 -0
  1152. vllm/model_executor/models/granitemoe.py +561 -0
  1153. vllm/model_executor/models/granitemoehybrid.py +703 -0
  1154. vllm/model_executor/models/granitemoeshared.py +328 -0
  1155. vllm/model_executor/models/gritlm.py +242 -0
  1156. vllm/model_executor/models/grok1.py +803 -0
  1157. vllm/model_executor/models/h2ovl.py +554 -0
  1158. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1159. vllm/model_executor/models/hunyuan_vision.py +1034 -0
  1160. vllm/model_executor/models/hyperclovax_vision.py +1163 -0
  1161. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1162. vllm/model_executor/models/idefics3.py +734 -0
  1163. vllm/model_executor/models/interfaces.py +1180 -0
  1164. vllm/model_executor/models/interfaces_base.py +252 -0
  1165. vllm/model_executor/models/intern_vit.py +454 -0
  1166. vllm/model_executor/models/internlm2.py +451 -0
  1167. vllm/model_executor/models/internlm2_ve.py +139 -0
  1168. vllm/model_executor/models/interns1.py +828 -0
  1169. vllm/model_executor/models/interns1_vit.py +433 -0
  1170. vllm/model_executor/models/internvl.py +1436 -0
  1171. vllm/model_executor/models/iquest_loopcoder.py +595 -0
  1172. vllm/model_executor/models/isaac.py +1503 -0
  1173. vllm/model_executor/models/jais.py +397 -0
  1174. vllm/model_executor/models/jais2.py +508 -0
  1175. vllm/model_executor/models/jamba.py +599 -0
  1176. vllm/model_executor/models/jina_vl.py +145 -0
  1177. vllm/model_executor/models/kanana_v.py +756 -0
  1178. vllm/model_executor/models/keye.py +1709 -0
  1179. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1180. vllm/model_executor/models/kimi_linear.py +659 -0
  1181. vllm/model_executor/models/kimi_vl.py +577 -0
  1182. vllm/model_executor/models/lfm2.py +515 -0
  1183. vllm/model_executor/models/lfm2_moe.py +746 -0
  1184. vllm/model_executor/models/lfm2_vl.py +732 -0
  1185. vllm/model_executor/models/lightonocr.py +197 -0
  1186. vllm/model_executor/models/llama.py +724 -0
  1187. vllm/model_executor/models/llama4.py +860 -0
  1188. vllm/model_executor/models/llama4_eagle.py +225 -0
  1189. vllm/model_executor/models/llama_eagle.py +213 -0
  1190. vllm/model_executor/models/llama_eagle3.py +375 -0
  1191. vllm/model_executor/models/llava.py +879 -0
  1192. vllm/model_executor/models/llava_next.py +583 -0
  1193. vllm/model_executor/models/llava_next_video.py +467 -0
  1194. vllm/model_executor/models/llava_onevision.py +922 -0
  1195. vllm/model_executor/models/longcat_flash.py +767 -0
  1196. vllm/model_executor/models/longcat_flash_mtp.py +348 -0
  1197. vllm/model_executor/models/mamba.py +276 -0
  1198. vllm/model_executor/models/mamba2.py +288 -0
  1199. vllm/model_executor/models/medusa.py +179 -0
  1200. vllm/model_executor/models/midashenglm.py +826 -0
  1201. vllm/model_executor/models/mimo.py +188 -0
  1202. vllm/model_executor/models/mimo_mtp.py +294 -0
  1203. vllm/model_executor/models/mimo_v2_flash.py +718 -0
  1204. vllm/model_executor/models/minicpm.py +660 -0
  1205. vllm/model_executor/models/minicpm3.py +233 -0
  1206. vllm/model_executor/models/minicpm_eagle.py +386 -0
  1207. vllm/model_executor/models/minicpmo.py +768 -0
  1208. vllm/model_executor/models/minicpmv.py +1742 -0
  1209. vllm/model_executor/models/minimax_m2.py +552 -0
  1210. vllm/model_executor/models/minimax_text_01.py +1008 -0
  1211. vllm/model_executor/models/minimax_vl_01.py +395 -0
  1212. vllm/model_executor/models/mistral3.py +638 -0
  1213. vllm/model_executor/models/mistral_large_3.py +63 -0
  1214. vllm/model_executor/models/mistral_large_3_eagle.py +137 -0
  1215. vllm/model_executor/models/mixtral.py +599 -0
  1216. vllm/model_executor/models/mllama4.py +1170 -0
  1217. vllm/model_executor/models/mlp_speculator.py +235 -0
  1218. vllm/model_executor/models/modernbert.py +458 -0
  1219. vllm/model_executor/models/module_mapping.py +74 -0
  1220. vllm/model_executor/models/molmo.py +1592 -0
  1221. vllm/model_executor/models/moonvit.py +601 -0
  1222. vllm/model_executor/models/mpt.py +335 -0
  1223. vllm/model_executor/models/nano_nemotron_vl.py +1725 -0
  1224. vllm/model_executor/models/nemotron.py +499 -0
  1225. vllm/model_executor/models/nemotron_h.py +902 -0
  1226. vllm/model_executor/models/nemotron_nas.py +474 -0
  1227. vllm/model_executor/models/nemotron_parse.py +958 -0
  1228. vllm/model_executor/models/nemotron_vl.py +651 -0
  1229. vllm/model_executor/models/nvlm_d.py +216 -0
  1230. vllm/model_executor/models/olmo.py +412 -0
  1231. vllm/model_executor/models/olmo2.py +454 -0
  1232. vllm/model_executor/models/olmoe.py +498 -0
  1233. vllm/model_executor/models/opencua.py +262 -0
  1234. vllm/model_executor/models/openpangu.py +1378 -0
  1235. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1236. vllm/model_executor/models/opt.py +426 -0
  1237. vllm/model_executor/models/orion.py +365 -0
  1238. vllm/model_executor/models/ouro.py +507 -0
  1239. vllm/model_executor/models/ovis.py +557 -0
  1240. vllm/model_executor/models/ovis2_5.py +661 -0
  1241. vllm/model_executor/models/paddleocr_vl.py +1261 -0
  1242. vllm/model_executor/models/paligemma.py +429 -0
  1243. vllm/model_executor/models/persimmon.py +373 -0
  1244. vllm/model_executor/models/phi.py +363 -0
  1245. vllm/model_executor/models/phi3.py +18 -0
  1246. vllm/model_executor/models/phi3v.py +729 -0
  1247. vllm/model_executor/models/phi4mm.py +1250 -0
  1248. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1249. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1250. vllm/model_executor/models/phimoe.py +671 -0
  1251. vllm/model_executor/models/pixtral.py +1437 -0
  1252. vllm/model_executor/models/plamo2.py +993 -0
  1253. vllm/model_executor/models/plamo3.py +437 -0
  1254. vllm/model_executor/models/qwen.py +377 -0
  1255. vllm/model_executor/models/qwen2.py +600 -0
  1256. vllm/model_executor/models/qwen2_5_omni_thinker.py +1200 -0
  1257. vllm/model_executor/models/qwen2_5_vl.py +1598 -0
  1258. vllm/model_executor/models/qwen2_audio.py +478 -0
  1259. vllm/model_executor/models/qwen2_moe.py +604 -0
  1260. vllm/model_executor/models/qwen2_rm.py +120 -0
  1261. vllm/model_executor/models/qwen2_vl.py +1588 -0
  1262. vllm/model_executor/models/qwen3.py +331 -0
  1263. vllm/model_executor/models/qwen3_moe.py +752 -0
  1264. vllm/model_executor/models/qwen3_next.py +1410 -0
  1265. vllm/model_executor/models/qwen3_next_mtp.py +293 -0
  1266. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1814 -0
  1267. vllm/model_executor/models/qwen3_vl.py +2120 -0
  1268. vllm/model_executor/models/qwen3_vl_moe.py +474 -0
  1269. vllm/model_executor/models/qwen_vl.py +821 -0
  1270. vllm/model_executor/models/radio.py +573 -0
  1271. vllm/model_executor/models/registry.py +1218 -0
  1272. vllm/model_executor/models/roberta.py +239 -0
  1273. vllm/model_executor/models/rvl.py +107 -0
  1274. vllm/model_executor/models/seed_oss.py +492 -0
  1275. vllm/model_executor/models/siglip.py +1259 -0
  1276. vllm/model_executor/models/siglip2.py +495 -0
  1277. vllm/model_executor/models/siglip2navit.py +660 -0
  1278. vllm/model_executor/models/skyworkr1v.py +951 -0
  1279. vllm/model_executor/models/smolvlm.py +38 -0
  1280. vllm/model_executor/models/solar.py +484 -0
  1281. vllm/model_executor/models/stablelm.py +354 -0
  1282. vllm/model_executor/models/starcoder2.py +365 -0
  1283. vllm/model_executor/models/step3_text.py +554 -0
  1284. vllm/model_executor/models/step3_vl.py +1147 -0
  1285. vllm/model_executor/models/swin.py +500 -0
  1286. vllm/model_executor/models/tarsier.py +624 -0
  1287. vllm/model_executor/models/telechat2.py +153 -0
  1288. vllm/model_executor/models/teleflm.py +78 -0
  1289. vllm/model_executor/models/terratorch.py +318 -0
  1290. vllm/model_executor/models/transformers/__init__.py +127 -0
  1291. vllm/model_executor/models/transformers/base.py +523 -0
  1292. vllm/model_executor/models/transformers/causal.py +65 -0
  1293. vllm/model_executor/models/transformers/legacy.py +90 -0
  1294. vllm/model_executor/models/transformers/moe.py +329 -0
  1295. vllm/model_executor/models/transformers/multimodal.py +441 -0
  1296. vllm/model_executor/models/transformers/pooling.py +102 -0
  1297. vllm/model_executor/models/transformers/utils.py +253 -0
  1298. vllm/model_executor/models/ultravox.py +786 -0
  1299. vllm/model_executor/models/utils.py +832 -0
  1300. vllm/model_executor/models/vision.py +546 -0
  1301. vllm/model_executor/models/voxtral.py +867 -0
  1302. vllm/model_executor/models/voxtral_streaming.py +304 -0
  1303. vllm/model_executor/models/whisper.py +993 -0
  1304. vllm/model_executor/models/whisper_utils.py +299 -0
  1305. vllm/model_executor/models/zamba2.py +986 -0
  1306. vllm/model_executor/parameter.py +642 -0
  1307. vllm/model_executor/utils.py +113 -0
  1308. vllm/model_executor/warmup/__init__.py +0 -0
  1309. vllm/model_executor/warmup/deep_gemm_warmup.py +371 -0
  1310. vllm/model_executor/warmup/kernel_warmup.py +97 -0
  1311. vllm/model_inspection.py +136 -0
  1312. vllm/multimodal/__init__.py +38 -0
  1313. vllm/multimodal/audio.py +287 -0
  1314. vllm/multimodal/base.py +60 -0
  1315. vllm/multimodal/cache.py +829 -0
  1316. vllm/multimodal/evs.py +294 -0
  1317. vllm/multimodal/hasher.py +123 -0
  1318. vllm/multimodal/image.py +155 -0
  1319. vllm/multimodal/inputs.py +1027 -0
  1320. vllm/multimodal/parse.py +674 -0
  1321. vllm/multimodal/processing.py +2469 -0
  1322. vllm/multimodal/profiling.py +351 -0
  1323. vllm/multimodal/registry.py +375 -0
  1324. vllm/multimodal/utils.py +550 -0
  1325. vllm/multimodal/video.py +512 -0
  1326. vllm/outputs.py +347 -0
  1327. vllm/platforms/__init__.py +277 -0
  1328. vllm/platforms/cpu.py +423 -0
  1329. vllm/platforms/cuda.py +618 -0
  1330. vllm/platforms/interface.py +707 -0
  1331. vllm/platforms/rocm.py +586 -0
  1332. vllm/platforms/tpu.py +20 -0
  1333. vllm/platforms/xpu.py +262 -0
  1334. vllm/plugins/__init__.py +81 -0
  1335. vllm/plugins/io_processors/__init__.py +68 -0
  1336. vllm/plugins/io_processors/interface.py +77 -0
  1337. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1338. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1339. vllm/pooling_params.py +229 -0
  1340. vllm/profiler/__init__.py +0 -0
  1341. vllm/profiler/layerwise_profile.py +392 -0
  1342. vllm/profiler/utils.py +151 -0
  1343. vllm/profiler/wrapper.py +241 -0
  1344. vllm/py.typed +2 -0
  1345. vllm/ray/__init__.py +0 -0
  1346. vllm/ray/lazy_utils.py +30 -0
  1347. vllm/ray/ray_env.py +79 -0
  1348. vllm/reasoning/__init__.py +96 -0
  1349. vllm/reasoning/abs_reasoning_parsers.py +318 -0
  1350. vllm/reasoning/basic_parsers.py +175 -0
  1351. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1352. vllm/reasoning/deepseek_v3_reasoning_parser.py +69 -0
  1353. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1354. vllm/reasoning/glm4_moe_reasoning_parser.py +13 -0
  1355. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1356. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1357. vllm/reasoning/holo2_reasoning_parser.py +89 -0
  1358. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1359. vllm/reasoning/identity_reasoning_parser.py +63 -0
  1360. vllm/reasoning/minimax_m2_reasoning_parser.py +110 -0
  1361. vllm/reasoning/mistral_reasoning_parser.py +154 -0
  1362. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1363. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1364. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1365. vllm/reasoning/step3_reasoning_parser.py +113 -0
  1366. vllm/sampling_params.py +629 -0
  1367. vllm/scalar_type.py +355 -0
  1368. vllm/scripts.py +17 -0
  1369. vllm/sequence.py +64 -0
  1370. vllm/tasks.py +13 -0
  1371. vllm/third_party/__init__.py +0 -0
  1372. vllm/third_party/pynvml.py +6140 -0
  1373. vllm/tokenizers/__init__.py +18 -0
  1374. vllm/tokenizers/deepseek_v32.py +187 -0
  1375. vllm/tokenizers/deepseek_v32_encoding.py +463 -0
  1376. vllm/tokenizers/detokenizer_utils.py +198 -0
  1377. vllm/tokenizers/grok2.py +443 -0
  1378. vllm/tokenizers/hf.py +119 -0
  1379. vllm/tokenizers/mistral.py +543 -0
  1380. vllm/tokenizers/protocol.py +123 -0
  1381. vllm/tokenizers/registry.py +238 -0
  1382. vllm/tool_parsers/__init__.py +158 -0
  1383. vllm/tool_parsers/abstract_tool_parser.py +274 -0
  1384. vllm/tool_parsers/deepseekv31_tool_parser.py +388 -0
  1385. vllm/tool_parsers/deepseekv32_tool_parser.py +591 -0
  1386. vllm/tool_parsers/deepseekv3_tool_parser.py +390 -0
  1387. vllm/tool_parsers/ernie45_tool_parser.py +210 -0
  1388. vllm/tool_parsers/functiongemma_tool_parser.py +321 -0
  1389. vllm/tool_parsers/gigachat3_tool_parser.py +190 -0
  1390. vllm/tool_parsers/glm47_moe_tool_parser.py +23 -0
  1391. vllm/tool_parsers/glm4_moe_tool_parser.py +215 -0
  1392. vllm/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  1393. vllm/tool_parsers/granite_tool_parser.py +253 -0
  1394. vllm/tool_parsers/hermes_tool_parser.py +495 -0
  1395. vllm/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  1396. vllm/tool_parsers/internlm2_tool_parser.py +227 -0
  1397. vllm/tool_parsers/jamba_tool_parser.py +323 -0
  1398. vllm/tool_parsers/kimi_k2_tool_parser.py +598 -0
  1399. vllm/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  1400. vllm/tool_parsers/llama_tool_parser.py +324 -0
  1401. vllm/tool_parsers/longcat_tool_parser.py +37 -0
  1402. vllm/tool_parsers/minimax_m2_tool_parser.py +776 -0
  1403. vllm/tool_parsers/minimax_tool_parser.py +849 -0
  1404. vllm/tool_parsers/mistral_tool_parser.py +612 -0
  1405. vllm/tool_parsers/olmo3_tool_parser.py +366 -0
  1406. vllm/tool_parsers/openai_tool_parser.py +111 -0
  1407. vllm/tool_parsers/phi4mini_tool_parser.py +120 -0
  1408. vllm/tool_parsers/pythonic_tool_parser.py +332 -0
  1409. vllm/tool_parsers/qwen3coder_tool_parser.py +781 -0
  1410. vllm/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  1411. vllm/tool_parsers/seed_oss_tool_parser.py +744 -0
  1412. vllm/tool_parsers/step3_tool_parser.py +303 -0
  1413. vllm/tool_parsers/utils.py +229 -0
  1414. vllm/tool_parsers/xlam_tool_parser.py +556 -0
  1415. vllm/tracing.py +135 -0
  1416. vllm/transformers_utils/__init__.py +26 -0
  1417. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1418. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1419. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1420. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1421. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1422. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1423. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1424. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1425. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1426. vllm/transformers_utils/config.py +1169 -0
  1427. vllm/transformers_utils/config_parser_base.py +20 -0
  1428. vllm/transformers_utils/configs/__init__.py +106 -0
  1429. vllm/transformers_utils/configs/afmoe.py +87 -0
  1430. vllm/transformers_utils/configs/arctic.py +216 -0
  1431. vllm/transformers_utils/configs/bagel.py +53 -0
  1432. vllm/transformers_utils/configs/chatglm.py +75 -0
  1433. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1434. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1435. vllm/transformers_utils/configs/eagle.py +90 -0
  1436. vllm/transformers_utils/configs/falcon.py +89 -0
  1437. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1438. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1439. vllm/transformers_utils/configs/isaac.py +100 -0
  1440. vllm/transformers_utils/configs/jais.py +243 -0
  1441. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1442. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1443. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1444. vllm/transformers_utils/configs/medusa.py +65 -0
  1445. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1446. vllm/transformers_utils/configs/mistral.py +263 -0
  1447. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1448. vllm/transformers_utils/configs/moonvit.py +33 -0
  1449. vllm/transformers_utils/configs/nemotron.py +220 -0
  1450. vllm/transformers_utils/configs/nemotron_h.py +284 -0
  1451. vllm/transformers_utils/configs/olmo3.py +83 -0
  1452. vllm/transformers_utils/configs/ovis.py +182 -0
  1453. vllm/transformers_utils/configs/qwen3_next.py +277 -0
  1454. vllm/transformers_utils/configs/radio.py +98 -0
  1455. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1456. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1457. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1458. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1459. vllm/transformers_utils/configs/tarsier2.py +24 -0
  1460. vllm/transformers_utils/configs/ultravox.py +120 -0
  1461. vllm/transformers_utils/dynamic_module.py +70 -0
  1462. vllm/transformers_utils/gguf_utils.py +280 -0
  1463. vllm/transformers_utils/model_arch_config_convertor.py +402 -0
  1464. vllm/transformers_utils/processor.py +424 -0
  1465. vllm/transformers_utils/processors/__init__.py +25 -0
  1466. vllm/transformers_utils/processors/bagel.py +78 -0
  1467. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1468. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1469. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1470. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1471. vllm/transformers_utils/processors/ovis.py +453 -0
  1472. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1473. vllm/transformers_utils/repo_utils.py +287 -0
  1474. vllm/transformers_utils/runai_utils.py +102 -0
  1475. vllm/transformers_utils/s3_utils.py +95 -0
  1476. vllm/transformers_utils/tokenizer.py +19 -0
  1477. vllm/transformers_utils/utils.py +112 -0
  1478. vllm/triton_utils/__init__.py +20 -0
  1479. vllm/triton_utils/importing.py +103 -0
  1480. vllm/usage/__init__.py +0 -0
  1481. vllm/usage/usage_lib.py +278 -0
  1482. vllm/utils/__init__.py +36 -0
  1483. vllm/utils/argparse_utils.py +491 -0
  1484. vllm/utils/async_utils.py +310 -0
  1485. vllm/utils/cache.py +214 -0
  1486. vllm/utils/collection_utils.py +112 -0
  1487. vllm/utils/counter.py +45 -0
  1488. vllm/utils/deep_gemm.py +424 -0
  1489. vllm/utils/flashinfer.py +602 -0
  1490. vllm/utils/func_utils.py +236 -0
  1491. vllm/utils/gc_utils.py +151 -0
  1492. vllm/utils/hashing.py +117 -0
  1493. vllm/utils/import_utils.py +438 -0
  1494. vllm/utils/jsontree.py +158 -0
  1495. vllm/utils/math_utils.py +32 -0
  1496. vllm/utils/mem_constants.py +13 -0
  1497. vllm/utils/mem_utils.py +285 -0
  1498. vllm/utils/nccl.py +64 -0
  1499. vllm/utils/network_utils.py +331 -0
  1500. vllm/utils/nvtx_pytorch_hooks.py +286 -0
  1501. vllm/utils/platform_utils.py +59 -0
  1502. vllm/utils/profiling.py +56 -0
  1503. vllm/utils/registry.py +51 -0
  1504. vllm/utils/serial_utils.py +214 -0
  1505. vllm/utils/system_utils.py +296 -0
  1506. vllm/utils/tensor_schema.py +255 -0
  1507. vllm/utils/torch_utils.py +781 -0
  1508. vllm/v1/__init__.py +0 -0
  1509. vllm/v1/attention/__init__.py +0 -0
  1510. vllm/v1/attention/backend.py +736 -0
  1511. vllm/v1/attention/backends/__init__.py +0 -0
  1512. vllm/v1/attention/backends/cpu_attn.py +501 -0
  1513. vllm/v1/attention/backends/fa_utils.py +126 -0
  1514. vllm/v1/attention/backends/flash_attn.py +1092 -0
  1515. vllm/v1/attention/backends/flash_attn_diffkv.py +277 -0
  1516. vllm/v1/attention/backends/flashinfer.py +1713 -0
  1517. vllm/v1/attention/backends/flex_attention.py +1024 -0
  1518. vllm/v1/attention/backends/gdn_attn.py +382 -0
  1519. vllm/v1/attention/backends/linear_attn.py +77 -0
  1520. vllm/v1/attention/backends/mamba1_attn.py +28 -0
  1521. vllm/v1/attention/backends/mamba2_attn.py +256 -0
  1522. vllm/v1/attention/backends/mamba_attn.py +313 -0
  1523. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1524. vllm/v1/attention/backends/mla/aiter_triton_mla.py +66 -0
  1525. vllm/v1/attention/backends/mla/common.py +2156 -0
  1526. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1527. vllm/v1/attention/backends/mla/flashattn_mla.py +348 -0
  1528. vllm/v1/attention/backends/mla/flashinfer_mla.py +175 -0
  1529. vllm/v1/attention/backends/mla/flashmla.py +321 -0
  1530. vllm/v1/attention/backends/mla/flashmla_sparse.py +1021 -0
  1531. vllm/v1/attention/backends/mla/indexer.py +345 -0
  1532. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +284 -0
  1533. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +321 -0
  1534. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1535. vllm/v1/attention/backends/registry.py +258 -0
  1536. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1537. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1538. vllm/v1/attention/backends/rocm_attn.py +405 -0
  1539. vllm/v1/attention/backends/short_conv_attn.py +26 -0
  1540. vllm/v1/attention/backends/tree_attn.py +430 -0
  1541. vllm/v1/attention/backends/triton_attn.py +578 -0
  1542. vllm/v1/attention/backends/utils.py +978 -0
  1543. vllm/v1/attention/ops/__init__.py +0 -0
  1544. vllm/v1/attention/ops/chunked_prefill_paged_decode.py +459 -0
  1545. vllm/v1/attention/ops/common.py +469 -0
  1546. vllm/v1/attention/ops/flashmla.py +254 -0
  1547. vllm/v1/attention/ops/merge_attn_states.py +47 -0
  1548. vllm/v1/attention/ops/paged_attn.py +51 -0
  1549. vllm/v1/attention/ops/pallas_kv_cache_update.py +130 -0
  1550. vllm/v1/attention/ops/prefix_prefill.py +862 -0
  1551. vllm/v1/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  1552. vllm/v1/attention/ops/triton_decode_attention.py +709 -0
  1553. vllm/v1/attention/ops/triton_merge_attn_states.py +116 -0
  1554. vllm/v1/attention/ops/triton_prefill_attention.py +272 -0
  1555. vllm/v1/attention/ops/triton_reshape_and_cache_flash.py +395 -0
  1556. vllm/v1/attention/ops/triton_unified_attention.py +1088 -0
  1557. vllm/v1/attention/ops/vit_attn_wrappers.py +185 -0
  1558. vllm/v1/attention/selector.py +145 -0
  1559. vllm/v1/core/__init__.py +0 -0
  1560. vllm/v1/core/block_pool.py +489 -0
  1561. vllm/v1/core/encoder_cache_manager.py +402 -0
  1562. vllm/v1/core/kv_cache_coordinator.py +560 -0
  1563. vllm/v1/core/kv_cache_manager.py +485 -0
  1564. vllm/v1/core/kv_cache_metrics.py +96 -0
  1565. vllm/v1/core/kv_cache_utils.py +1642 -0
  1566. vllm/v1/core/sched/__init__.py +0 -0
  1567. vllm/v1/core/sched/async_scheduler.py +66 -0
  1568. vllm/v1/core/sched/interface.py +205 -0
  1569. vllm/v1/core/sched/output.py +261 -0
  1570. vllm/v1/core/sched/request_queue.py +208 -0
  1571. vllm/v1/core/sched/scheduler.py +1936 -0
  1572. vllm/v1/core/sched/utils.py +64 -0
  1573. vllm/v1/core/single_type_kv_cache_manager.py +926 -0
  1574. vllm/v1/cudagraph_dispatcher.py +183 -0
  1575. vllm/v1/engine/__init__.py +224 -0
  1576. vllm/v1/engine/async_llm.py +874 -0
  1577. vllm/v1/engine/coordinator.py +396 -0
  1578. vllm/v1/engine/core.py +1614 -0
  1579. vllm/v1/engine/core_client.py +1422 -0
  1580. vllm/v1/engine/detokenizer.py +351 -0
  1581. vllm/v1/engine/exceptions.py +18 -0
  1582. vllm/v1/engine/input_processor.py +713 -0
  1583. vllm/v1/engine/llm_engine.py +415 -0
  1584. vllm/v1/engine/logprobs.py +245 -0
  1585. vllm/v1/engine/output_processor.py +715 -0
  1586. vllm/v1/engine/parallel_sampling.py +150 -0
  1587. vllm/v1/engine/utils.py +1086 -0
  1588. vllm/v1/executor/__init__.py +6 -0
  1589. vllm/v1/executor/abstract.py +352 -0
  1590. vllm/v1/executor/multiproc_executor.py +888 -0
  1591. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1592. vllm/v1/executor/ray_executor.py +623 -0
  1593. vllm/v1/executor/ray_utils.py +468 -0
  1594. vllm/v1/executor/uniproc_executor.py +186 -0
  1595. vllm/v1/kv_cache_interface.py +485 -0
  1596. vllm/v1/kv_offload/__init__.py +0 -0
  1597. vllm/v1/kv_offload/abstract.py +161 -0
  1598. vllm/v1/kv_offload/arc_manager.py +237 -0
  1599. vllm/v1/kv_offload/backend.py +97 -0
  1600. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1601. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1602. vllm/v1/kv_offload/cpu.py +109 -0
  1603. vllm/v1/kv_offload/factory.py +58 -0
  1604. vllm/v1/kv_offload/lru_manager.py +139 -0
  1605. vllm/v1/kv_offload/mediums.py +39 -0
  1606. vllm/v1/kv_offload/spec.py +70 -0
  1607. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1608. vllm/v1/kv_offload/worker/cpu_gpu.py +287 -0
  1609. vllm/v1/kv_offload/worker/worker.py +163 -0
  1610. vllm/v1/metrics/__init__.py +0 -0
  1611. vllm/v1/metrics/loggers.py +1320 -0
  1612. vllm/v1/metrics/perf.py +1244 -0
  1613. vllm/v1/metrics/prometheus.py +82 -0
  1614. vllm/v1/metrics/ray_wrappers.py +194 -0
  1615. vllm/v1/metrics/reader.py +257 -0
  1616. vllm/v1/metrics/stats.py +440 -0
  1617. vllm/v1/outputs.py +242 -0
  1618. vllm/v1/pool/__init__.py +0 -0
  1619. vllm/v1/pool/metadata.py +124 -0
  1620. vllm/v1/request.py +281 -0
  1621. vllm/v1/sample/__init__.py +0 -0
  1622. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1623. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1624. vllm/v1/sample/logits_processor/interface.py +106 -0
  1625. vllm/v1/sample/logits_processor/state.py +165 -0
  1626. vllm/v1/sample/metadata.py +44 -0
  1627. vllm/v1/sample/ops/__init__.py +0 -0
  1628. vllm/v1/sample/ops/bad_words.py +57 -0
  1629. vllm/v1/sample/ops/logprobs.py +25 -0
  1630. vllm/v1/sample/ops/penalties.py +57 -0
  1631. vllm/v1/sample/ops/topk_topp_sampler.py +388 -0
  1632. vllm/v1/sample/rejection_sampler.py +822 -0
  1633. vllm/v1/sample/sampler.py +319 -0
  1634. vllm/v1/sample/tpu/__init__.py +0 -0
  1635. vllm/v1/sample/tpu/metadata.py +120 -0
  1636. vllm/v1/sample/tpu/sampler.py +215 -0
  1637. vllm/v1/serial_utils.py +514 -0
  1638. vllm/v1/spec_decode/__init__.py +0 -0
  1639. vllm/v1/spec_decode/eagle.py +1346 -0
  1640. vllm/v1/spec_decode/medusa.py +73 -0
  1641. vllm/v1/spec_decode/metadata.py +66 -0
  1642. vllm/v1/spec_decode/metrics.py +225 -0
  1643. vllm/v1/spec_decode/ngram_proposer.py +281 -0
  1644. vllm/v1/spec_decode/suffix_decoding.py +95 -0
  1645. vllm/v1/spec_decode/utils.py +109 -0
  1646. vllm/v1/structured_output/__init__.py +337 -0
  1647. vllm/v1/structured_output/backend_guidance.py +291 -0
  1648. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1649. vllm/v1/structured_output/backend_outlines.py +324 -0
  1650. vllm/v1/structured_output/backend_types.py +136 -0
  1651. vllm/v1/structured_output/backend_xgrammar.py +378 -0
  1652. vllm/v1/structured_output/request.py +91 -0
  1653. vllm/v1/structured_output/utils.py +457 -0
  1654. vllm/v1/utils.py +466 -0
  1655. vllm/v1/worker/__init__.py +0 -0
  1656. vllm/v1/worker/block_table.py +343 -0
  1657. vllm/v1/worker/cp_utils.py +42 -0
  1658. vllm/v1/worker/cpu_model_runner.py +122 -0
  1659. vllm/v1/worker/cpu_worker.py +192 -0
  1660. vllm/v1/worker/dp_utils.py +240 -0
  1661. vllm/v1/worker/ec_connector_model_runner_mixin.py +85 -0
  1662. vllm/v1/worker/gpu/README.md +4 -0
  1663. vllm/v1/worker/gpu/__init__.py +0 -0
  1664. vllm/v1/worker/gpu/async_utils.py +98 -0
  1665. vllm/v1/worker/gpu/attn_utils.py +183 -0
  1666. vllm/v1/worker/gpu/block_table.py +222 -0
  1667. vllm/v1/worker/gpu/buffer_utils.py +224 -0
  1668. vllm/v1/worker/gpu/cudagraph_utils.py +264 -0
  1669. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1670. vllm/v1/worker/gpu/input_batch.py +526 -0
  1671. vllm/v1/worker/gpu/metrics/__init__.py +0 -0
  1672. vllm/v1/worker/gpu/metrics/logits.py +42 -0
  1673. vllm/v1/worker/gpu/mm/__init__.py +0 -0
  1674. vllm/v1/worker/gpu/mm/mrope_utils.py +127 -0
  1675. vllm/v1/worker/gpu/model_runner.py +1005 -0
  1676. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1677. vllm/v1/worker/gpu/sample/gumbel.py +106 -0
  1678. vllm/v1/worker/gpu/sample/logit_bias.py +270 -0
  1679. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1680. vllm/v1/worker/gpu/sample/metadata.py +79 -0
  1681. vllm/v1/worker/gpu/sample/min_p.py +58 -0
  1682. vllm/v1/worker/gpu/sample/output.py +14 -0
  1683. vllm/v1/worker/gpu/sample/penalties.py +155 -0
  1684. vllm/v1/worker/gpu/sample/sampler.py +88 -0
  1685. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1686. vllm/v1/worker/gpu/spec_decode/eagle.py +566 -0
  1687. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1688. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +71 -0
  1689. vllm/v1/worker/gpu/states.py +282 -0
  1690. vllm/v1/worker/gpu/structured_outputs.py +100 -0
  1691. vllm/v1/worker/gpu_input_batch.py +1030 -0
  1692. vllm/v1/worker/gpu_model_runner.py +5761 -0
  1693. vllm/v1/worker/gpu_ubatch_wrapper.py +475 -0
  1694. vllm/v1/worker/gpu_worker.py +968 -0
  1695. vllm/v1/worker/kv_connector_model_runner_mixin.py +300 -0
  1696. vllm/v1/worker/lora_model_runner_mixin.py +225 -0
  1697. vllm/v1/worker/tpu_input_batch.py +574 -0
  1698. vllm/v1/worker/tpu_worker.py +18 -0
  1699. vllm/v1/worker/ubatch_utils.py +112 -0
  1700. vllm/v1/worker/ubatching.py +242 -0
  1701. vllm/v1/worker/utils.py +400 -0
  1702. vllm/v1/worker/worker_base.py +372 -0
  1703. vllm/v1/worker/workspace.py +253 -0
  1704. vllm/v1/worker/xpu_model_runner.py +48 -0
  1705. vllm/v1/worker/xpu_worker.py +174 -0
  1706. vllm/version.py +39 -0
  1707. vllm/vllm_flash_attn/.gitkeep +0 -0
  1708. vllm_cpu_avx512bf16-0.14.0.dist-info/METADATA +348 -0
  1709. vllm_cpu_avx512bf16-0.14.0.dist-info/RECORD +1712 -0
  1710. vllm_cpu_avx512bf16-0.14.0.dist-info/WHEEL +5 -0
  1711. vllm_cpu_avx512bf16-0.14.0.dist-info/entry_points.txt +5 -0
  1712. vllm_cpu_avx512bf16-0.14.0.dist-info/top_level.txt +1 -0
vllm/v1/core/sched/scheduler.py
@@ -0,0 +1,1936 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+ import itertools
+ import time
+ from collections import defaultdict
+ from collections.abc import Iterable
+ from typing import Any
+
+ import numpy as np
+
+ from vllm import envs
+ from vllm.compilation.cuda_graph import CUDAGraphStat
+ from vllm.config import VllmConfig
+ from vllm.distributed.ec_transfer.ec_connector.base import (
+     ECConnectorMetadata,
+     ECConnectorRole,
+ )
+ from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory
+ from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch
+ from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
+ from vllm.distributed.kv_transfer.kv_connector.v1 import (
+     KVConnectorBase_V1,
+     KVConnectorRole,
+     SupportsHMA,
+ )
+ from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
+ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.fused_moe.routed_experts_capturer import (
+     RoutedExpertsReader,
+ )
+ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+ from vllm.v1.core.encoder_cache_manager import (
+     EncoderCacheManager,
+     EncoderDecoderCacheManager,
+     compute_encoder_budget,
+ )
+ from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
+ from vllm.v1.core.kv_cache_metrics import KVCacheMetricsCollector
+ from vllm.v1.core.sched.interface import SchedulerInterface
+ from vllm.v1.core.sched.output import (
+     CachedRequestData,
+     GrammarOutput,
+     NewRequestData,
+     SchedulerOutput,
+ )
+ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_queue
+ from vllm.v1.core.sched.utils import check_stop, remove_all
+ from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
+ from vllm.v1.kv_cache_interface import KVCacheConfig
+ from vllm.v1.metrics.perf import ModelMetrics, PerfStats
+ from vllm.v1.metrics.stats import (
+     PrefixCacheStats,
+     SchedulerStats,
+ )
+ from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput
+ from vllm.v1.request import Request, RequestStatus
+ from vllm.v1.spec_decode.metrics import SpecDecodingStats
+ from vllm.v1.structured_output import StructuredOutputManager
+ from vllm.v1.utils import record_function_or_nullcontext
+
+ logger = init_logger(__name__)
+
+
+ class Scheduler(SchedulerInterface):
+     def __init__(
+         self,
+         vllm_config: VllmConfig,
+         kv_cache_config: KVCacheConfig,
+         structured_output_manager: StructuredOutputManager,
+         block_size: int,
+         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+         include_finished_set: bool = False,
+         log_stats: bool = False,
+     ) -> None:
+         self.vllm_config = vllm_config
+         self.scheduler_config = vllm_config.scheduler_config
+         self.cache_config = vllm_config.cache_config
+         self.lora_config = vllm_config.lora_config
+         self.kv_cache_config = kv_cache_config
+         self.kv_events_config = vllm_config.kv_events_config
+         self.parallel_config = vllm_config.parallel_config
+         self.log_stats = log_stats
+         self.observability_config = vllm_config.observability_config
+         self.kv_metrics_collector: KVCacheMetricsCollector | None = None
+         if self.observability_config.kv_cache_metrics:
+             self.kv_metrics_collector = KVCacheMetricsCollector(
+                 self.observability_config.kv_cache_metrics_sample,
+             )
+         self.structured_output_manager = structured_output_manager
+         self.is_encoder_decoder = vllm_config.model_config.is_encoder_decoder
+
+         # include_finished_set controls whether a separate set of finished
+         # request ids should be included in the EngineCoreOutputs returned
+         # by update_from_outputs(). This is currently used in the multi-engine
+         # case to track request lifetimes efficiently.
+         self.finished_req_ids_dict: dict[int, set[str]] | None = (
+             defaultdict(set) if include_finished_set else None
+         )
+         self.prev_step_scheduled_req_ids: set[str] = set()
+
+         # Scheduling constraints.
+         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
+         self.max_num_scheduled_tokens = self.scheduler_config.max_num_batched_tokens
+         self.max_model_len = vllm_config.model_config.max_model_len
+         self.enable_kv_cache_events = (
+             self.kv_events_config is not None
+             and self.kv_events_config.enable_kv_cache_events
+         )
+
+         # Create KVConnector for the Scheduler. Note that each Worker
+         # will have a corresponding KVConnector with Role=WORKER.
+         # The KV connector pushes/pulls remote KVs for P/D and offloading.
+         self.connector = None
+         self.connector_prefix_cache_stats: PrefixCacheStats | None = None
+         self.recompute_kv_load_failures = True
+         if self.vllm_config.kv_transfer_config is not None:
+             assert not self.is_encoder_decoder, (
+                 "Encoder-decoder models are not currently supported with KV connectors"
+             )
+             self.connector = KVConnectorFactory.create_connector(
+                 config=self.vllm_config,
+                 role=KVConnectorRole.SCHEDULER,
+                 kv_cache_config=self.kv_cache_config,
+             )
+             if self.log_stats:
+                 self.connector_prefix_cache_stats = PrefixCacheStats()
+             kv_load_failure_policy = (
+                 self.vllm_config.kv_transfer_config.kv_load_failure_policy
+             )
+             self.recompute_kv_load_failures = kv_load_failure_policy == "recompute"
+
+         self.kv_event_publisher = EventPublisherFactory.create(
+             self.kv_events_config,
+             self.parallel_config.data_parallel_index,
+         )
+         self.ec_connector = None
+         if self.vllm_config.ec_transfer_config is not None:
+             self.ec_connector = ECConnectorFactory.create_connector(
+                 config=self.vllm_config, role=ECConnectorRole.SCHEDULER
+             )
+
+         num_gpu_blocks = self.cache_config.num_gpu_blocks
+         assert num_gpu_blocks is not None and num_gpu_blocks > 0
+
+         self.block_size = block_size
+         self.dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size
+         self.pcp_world_size = vllm_config.parallel_config.prefill_context_parallel_size
+
+         # req_id -> Request
+         self.requests: dict[str, Request] = {}
+         # Scheduling policy
+         try:
+             self.policy = SchedulingPolicy(self.scheduler_config.policy)
+         except ValueError as e:
+             raise ValueError(
+                 f"Unknown scheduling policy: {self.scheduler_config.policy}"
+             ) from e
+         # Priority queues for requests.
+         self.waiting = create_request_queue(self.policy)
+         self.running: list[Request] = []
+
+         # The request IDs that are finished in between the previous and the
+         # current steps. This is used to notify the workers about the finished
+         # requests so that they can free the cached states for those requests.
+         # This is flushed at the end of each scheduling step.
+         self.finished_req_ids: set[str] = set()
+
+         # KV Connector: requests in process of async KV loading or recving
+         self.finished_recving_kv_req_ids: set[str] = set()
+         self.failed_recving_kv_req_ids: set[str] = set()
+
+         # Encoder-related.
+         # Calculate encoder cache size if applicable
+         # NOTE: For now we use the same budget for both compute and space.
+         # This can be changed when we make encoder cache for embedding caching
+         # across requests.
+         encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
+             model_config=vllm_config.model_config,
+             scheduler_config=vllm_config.scheduler_config,
+             mm_registry=mm_registry,
+         )
+
+         # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
+         # projector if needed) for MM models as well as encoder-decoder
+         # transformers.
+         self.max_num_encoder_input_tokens = encoder_compute_budget
+         # NOTE: For the models without encoder (e.g., text-only models),
+         # the encoder cache will not be initialized because cache size is 0
+         # for these models.
+         self.encoder_cache_manager = (
+             EncoderDecoderCacheManager(cache_size=encoder_cache_size)
+             if self.is_encoder_decoder
+             else EncoderCacheManager(cache_size=encoder_cache_size)
+         )
+         # For encoder-decoder models, allocate the maximum number of tokens for Cross
+         # Attn blocks, as for Whisper its input is always padded to the maximum length.
+         # TODO (NickLucche): Generalize to models with variable-length encoder inputs.
+         self._num_encoder_max_input_tokens = (
+             MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(vllm_config.model_config)
+         )
+
+         speculative_config = vllm_config.speculative_config
+         self.use_eagle = False
+         self.num_spec_tokens = self.num_lookahead_tokens = 0
+         if speculative_config:
+             self.num_spec_tokens = speculative_config.num_speculative_tokens
+             if speculative_config.use_eagle():
+                 self.use_eagle = True
+                 self.num_lookahead_tokens = self.num_spec_tokens
+
+         # Create the KV cache manager.
+         self.kv_cache_manager = KVCacheManager(
+             kv_cache_config=kv_cache_config,
+             max_model_len=self.max_model_len,
+             enable_caching=self.cache_config.enable_prefix_caching,
+             use_eagle=self.use_eagle,
+             log_stats=self.log_stats,
+             enable_kv_cache_events=self.enable_kv_cache_events,
+             dcp_world_size=self.dcp_world_size,
+             pcp_world_size=self.pcp_world_size,
+             hash_block_size=self.block_size,
+             metrics_collector=self.kv_metrics_collector,
+         )
+         self.use_pp = self.parallel_config.pipeline_parallel_size > 1
+         self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
+         self.perf_metrics: ModelMetrics | None = None
+         if self.log_stats and vllm_config.observability_config.enable_mfu_metrics:
+             self.perf_metrics = ModelMetrics(vllm_config)
+
+         if self.vllm_config.model_config.enable_return_routed_experts:
+             assert self.dcp_world_size == 1 and self.pcp_world_size == 1, (
+                 "enable_return_routed_experts does not support context parallelism "
+                 "(dcp_world_size > 1 or pcp_world_size > 1)"
+             )
+
+             self.routed_experts_reader = RoutedExpertsReader.create()
+
+             assert len(kv_cache_config.kv_cache_groups) > 0, (
+                 "enable_return_routed_experts requires at least one kv cache group"
+             )
+             self.max_num_kv_tokens = (
+                 kv_cache_config.num_blocks // len(kv_cache_config.kv_cache_groups) + 1
+             ) * self.block_size
+
+             self.routed_experts_reader.attach_buffer(
+                 max_num_kv_tokens=self.max_num_kv_tokens,
+                 model_config=self.vllm_config.model_config,
+                 instance_id=self.vllm_config.instance_id,
+             )
+
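The schedule() method that follows is built entirely around the token-accounting model stated in its opening NOTE: there is no separate prefill or decode phase; each request simply tries to catch its num_computed_tokens up to its num_tokens_with_spec, clamped by the shared per-step token budget. A minimal standalone sketch of that arithmetic (illustrative only, not part of the packaged file; SimpleRequest and all numbers are hypothetical stand-ins, not vLLM's Request class):

    from dataclasses import dataclass

    @dataclass
    class SimpleRequest:
        num_prompt_tokens: int
        num_output_tokens: int
        num_spec_tokens: int
        num_computed_tokens: int

        @property
        def num_tokens_with_spec(self) -> int:
            # prompt + generated + speculative draft tokens
            return (
                self.num_prompt_tokens + self.num_output_tokens + self.num_spec_tokens
            )

    def tokens_to_schedule(req: SimpleRequest, token_budget: int) -> int:
        # How far num_computed_tokens lags the target, capped by the step budget.
        return min(req.num_tokens_with_spec - req.num_computed_tokens, token_budget)

    # Chunked prefill in progress: 1000-token prompt, 512 tokens computed so far.
    assert tokens_to_schedule(SimpleRequest(1000, 0, 0, 512), 2048) == 488
    # Decode with 2 draft tokens: 1 newly sampled token + 2 drafts go to the model.
    assert tokens_to_schedule(SimpleRequest(1000, 5, 2, 1004), 2048) == 3

The same rule covers chunked prefill (a large lag consumed budget-sized chunks at a time) and speculative decoding (a lag of a few tokens per step).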
+     def schedule(self) -> SchedulerOutput:
+         # NOTE(woosuk) on the scheduling algorithm:
+         # There's no "decoding phase" nor "prefill phase" in the scheduler.
+         # Each request just has the num_computed_tokens and
+         # num_tokens_with_spec. num_tokens_with_spec =
+         # len(prompt_token_ids) + len(output_token_ids) + len(spec_token_ids).
+         # At each step, the scheduler tries to assign tokens to the requests
+         # so that each request's num_computed_tokens can catch up to its
+         # num_tokens_with_spec. This is general enough to cover
+         # chunked prefills, prefix caching, speculative decoding,
+         # and the "jump decoding" optimization in the future.
+
+         scheduled_new_reqs: list[Request] = []
+         scheduled_resumed_reqs: list[Request] = []
+         scheduled_running_reqs: list[Request] = []
+         preempted_reqs: list[Request] = []
+
+         req_to_new_blocks: dict[str, KVCacheBlocks] = {}
+         num_scheduled_tokens: dict[str, int] = {}
+         token_budget = self.max_num_scheduled_tokens
+         # Encoder-related.
+         scheduled_encoder_inputs: dict[str, list[int]] = {}
+         encoder_compute_budget = self.max_num_encoder_input_tokens
+         # Spec decode-related.
+         scheduled_spec_decode_tokens: dict[str, list[int]] = {}
+
+         # For logging.
+         scheduled_timestamp = time.monotonic()
+
+         # First, schedule the RUNNING requests.
+         req_index = 0
+         while req_index < len(self.running) and token_budget > 0:
+             request = self.running[req_index]
+
+             if (
+                 request.num_output_placeholders > 0
+                 # This is (num_computed_tokens + 1) - (num_output_placeholders - 1).
+                 # Since output placeholders are also included in the computed tokens
+                 # count, we subtract (num_output_placeholders - 1) to remove any draft
+                 # tokens, so that we can be sure no further steps are needed even if
+                 # they are all rejected.
+                 and request.num_computed_tokens + 2 - request.num_output_placeholders
+                 >= request.num_prompt_tokens + request.max_tokens
+             ):
+                 # Async scheduling: Avoid scheduling an extra step when we are sure that
+                 # the previous step has reached request.max_tokens. We don't schedule
+                 # partial draft tokens since this prevents uniform decode optimizations.
+                 req_index += 1
+                 continue
+
+             num_new_tokens = (
+                 request.num_tokens_with_spec
+                 + request.num_output_placeholders
+                 - request.num_computed_tokens
+             )
+             if 0 < self.scheduler_config.long_prefill_token_threshold < num_new_tokens:
+                 num_new_tokens = self.scheduler_config.long_prefill_token_threshold
+             num_new_tokens = min(num_new_tokens, token_budget)
+
+             # Make sure the input position does not exceed the max model len.
+             # This is necessary when using spec decoding.
+             num_new_tokens = min(
+                 num_new_tokens, self.max_model_len - 1 - request.num_computed_tokens
+             )
+
+             # Schedule encoder inputs.
+             encoder_inputs_to_schedule = None
+             external_load_encoder_input: list[int] = []
+             new_encoder_compute_budget = encoder_compute_budget
+             if request.has_encoder_inputs:
+                 (
+                     encoder_inputs_to_schedule,
+                     num_new_tokens,
+                     new_encoder_compute_budget,
+                     external_load_encoder_input,
+                 ) = self._try_schedule_encoder_inputs(
+                     request,
+                     request.num_computed_tokens,
+                     num_new_tokens,
+                     encoder_compute_budget,
+                     shift_computed_tokens=1 if self.use_eagle else 0,
+                 )
+
+             if num_new_tokens == 0:
+                 # The request cannot be scheduled for one of the following
+                 # reasons:
+                 # 1. No new tokens to schedule. This may happen when
+                 #    (1) PP>1 and we have already scheduled all prompt tokens
+                 #    but they are not finished yet.
+                 #    (2) Async scheduling and the request has reached either
+                 #    its max_total_tokens or max_model_len.
+                 # 2. The encoder budget is exhausted.
+                 # 3. The encoder cache is exhausted.
+                 # NOTE(woosuk): Here, by doing `continue` instead of `break`,
+                 # we do not strictly follow the FCFS scheduling policy and
+                 # allow the lower-priority requests to be scheduled.
+                 req_index += 1
+                 continue
+
+             # Schedule newly needed KV blocks for the request.
+             with record_function_or_nullcontext("schedule: allocate_slots"):
+                 while True:
+                     new_blocks = self.kv_cache_manager.allocate_slots(
+                         request,
+                         num_new_tokens,
+                         num_lookahead_tokens=self.num_lookahead_tokens,
+                     )
+
+                     if new_blocks is not None:
+                         # The request can be scheduled.
+                         break
+
+                     # The request cannot be scheduled.
+                     # Preempt the lowest-priority request.
+                     if self.policy == SchedulingPolicy.PRIORITY:
+                         preempted_req = max(
+                             self.running,
+                             key=lambda r: (r.priority, r.arrival_time),
+                         )
+                         self.running.remove(preempted_req)
+                         if preempted_req in scheduled_running_reqs:
+                             scheduled_running_reqs.remove(preempted_req)
+                             token_budget += num_scheduled_tokens[
+                                 preempted_req.request_id
+                             ]
+                             req_to_new_blocks.pop(preempted_req.request_id)
+                             num_scheduled_tokens.pop(preempted_req.request_id)
+                             scheduled_spec_decode_tokens.pop(
+                                 preempted_req.request_id, None
+                             )
+                             preempted_encoder_inputs = scheduled_encoder_inputs.pop(
+                                 preempted_req.request_id, None
+                             )
+                             if preempted_encoder_inputs:
+                                 # Restore encoder compute budget if the preempted
+                                 # request had encoder inputs scheduled in this step.
+                                 num_embeds_to_restore = sum(
+                                     preempted_req.get_num_encoder_embeds(i)
+                                     for i in preempted_encoder_inputs
+                                 )
+                                 encoder_compute_budget += num_embeds_to_restore
+                             req_index -= 1
+                     else:
+                         preempted_req = self.running.pop()
+
+                     self._preempt_request(preempted_req, scheduled_timestamp)
+                     preempted_reqs.append(preempted_req)
+                     if preempted_req == request:
+                         # No more requests to preempt. Cannot schedule this request.
+                         break
+
+             if new_blocks is None:
+                 # Cannot schedule this request.
+                 break
+
+             # Schedule the request.
+             scheduled_running_reqs.append(request)
+             req_to_new_blocks[request.request_id] = new_blocks
+             num_scheduled_tokens[request.request_id] = num_new_tokens
+             token_budget -= num_new_tokens
+             req_index += 1
+
+             # Speculative decode related.
+             if request.spec_token_ids:
+                 num_scheduled_spec_tokens = (
+                     num_new_tokens
+                     + request.num_computed_tokens
+                     - request.num_tokens
+                     - request.num_output_placeholders
+                 )
+                 if num_scheduled_spec_tokens > 0:
+                     # Trim spec_token_ids list to num_scheduled_spec_tokens.
+                     del request.spec_token_ids[num_scheduled_spec_tokens:]
+                     scheduled_spec_decode_tokens[request.request_id] = (
+                         request.spec_token_ids
+                     )
+                 # New spec tokens will be set in `update_draft_token_ids` before the
+                 # next step when applicable.
+                 request.spec_token_ids = []
+
+             # Encoder-related.
+             if encoder_inputs_to_schedule:
+                 scheduled_encoder_inputs[request.request_id] = (
+                     encoder_inputs_to_schedule
+                 )
+                 # Allocate the encoder cache.
+                 for i in encoder_inputs_to_schedule:
+                     self.encoder_cache_manager.allocate(request, i)
+                 encoder_compute_budget = new_encoder_compute_budget
+             if external_load_encoder_input:
+                 for i in external_load_encoder_input:
+                     self.encoder_cache_manager.allocate(request, i)
+                     if self.ec_connector is not None:
+                         self.ec_connector.update_state_after_alloc(request, i)
+
+         # Record the LoRAs in scheduled_running_reqs
+         scheduled_loras: set[int] = set()
+         if self.lora_config:
+             scheduled_loras = set(
+                 req.lora_request.lora_int_id
+                 for req in scheduled_running_reqs
+                 if req.lora_request and req.lora_request.lora_int_id > 0
+             )
+             assert len(scheduled_loras) <= self.lora_config.max_loras
+
+         # Use a temporary RequestQueue to collect requests that need to be
+         # skipped and put back at the head of the waiting queue later.
+         skipped_waiting_requests = create_request_queue(self.policy)
+
+         # Next, schedule the WAITING requests.
+         if not preempted_reqs:
+             while self.waiting and token_budget > 0:
+                 if len(self.running) == self.max_num_running_reqs:
+                     break
+
+                 request = self.waiting.peek_request()
+
+                 # KVTransfer: skip request if still waiting for remote kvs.
+                 if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                     is_ready = self._update_waiting_for_remote_kv(request)
+                     if is_ready:
+                         if request.num_preemptions:
+                             # We must be loading for a resumed preemption
+                             # rather than a new request.
+                             request.status = RequestStatus.PREEMPTED
+                         else:
+                             request.status = RequestStatus.WAITING
+                     else:
+                         logger.debug(
+                             "%s is still in WAITING_FOR_REMOTE_KVS state.",
+                             request.request_id,
+                         )
+                         self.waiting.pop_request()
+                         skipped_waiting_requests.prepend_request(request)
+                         continue
+
+                 # Skip request if the structured output request is still waiting
+                 # for FSM compilation.
+                 if request.status == RequestStatus.WAITING_FOR_FSM:
+                     structured_output_req = request.structured_output_request
+                     if structured_output_req and structured_output_req.grammar:
+                         request.status = RequestStatus.WAITING
+                     else:
+                         self.waiting.pop_request()
+                         skipped_waiting_requests.prepend_request(request)
+                         continue
+
+                 # Check that adding the request still respects the max_loras
+                 # constraint.
+                 if (
+                     self.lora_config
+                     and request.lora_request
+                     and (
+                         len(scheduled_loras) == self.lora_config.max_loras
+                         and request.lora_request.lora_int_id not in scheduled_loras
+                     )
+                 ):
+                     # Scheduling would exceed max_loras, skip.
+                     self.waiting.pop_request()
+                     skipped_waiting_requests.prepend_request(request)
+                     continue
+
+                 num_external_computed_tokens = 0
+                 load_kv_async = False
+
+                 # Get already-cached tokens.
+                 if request.num_computed_tokens == 0:
+                     # Get locally-cached tokens.
+                     new_computed_blocks, num_new_local_computed_tokens = (
+                         self.kv_cache_manager.get_computed_blocks(request)
+                     )
+
+                     # Get externally-cached tokens if using a KVConnector.
+                     if self.connector is not None:
+                         ext_tokens, load_kv_async = (
+                             self.connector.get_num_new_matched_tokens(
+                                 request, num_new_local_computed_tokens
+                             )
+                         )
+
+                         if ext_tokens is None:
+                             # The request cannot be scheduled because
+                             # the KVConnector couldn't determine
+                             # the number of matched tokens.
+                             self.waiting.pop_request()
+                             skipped_waiting_requests.prepend_request(request)
+                             continue
+
+                         request.num_external_computed_tokens = ext_tokens
+                         num_external_computed_tokens = ext_tokens
+
+                     # Total computed tokens (local + external).
+                     num_computed_tokens = (
+                         num_new_local_computed_tokens + num_external_computed_tokens
+                     )
+                 else:
+                     # KVTransfer: WAITING reqs have num_computed_tokens > 0
+                     # after async KV recvs are completed.
+                     new_computed_blocks = self.kv_cache_manager.empty_kv_cache_blocks
+                     num_new_local_computed_tokens = 0
+                     num_computed_tokens = request.num_computed_tokens
+
+                 encoder_inputs_to_schedule = None
+                 external_load_encoder_input = []
+                 new_encoder_compute_budget = encoder_compute_budget
+
+                 if load_kv_async:
+                     # KVTransfer: loading remote KV, do not allocate for new work.
+                     assert num_external_computed_tokens > 0
+                     num_new_tokens = 0
+                 else:
+                     # Number of tokens to be scheduled.
+                     # We use `request.num_tokens` instead of
+                     # `request.num_prompt_tokens` to consider the resumed
+                     # requests, which have output tokens.
+                     num_new_tokens = request.num_tokens - num_computed_tokens
+                     threshold = self.scheduler_config.long_prefill_token_threshold
+                     if 0 < threshold < num_new_tokens:
+                         num_new_tokens = threshold
+
+                     # Chunked prefill has to be enabled explicitly to allow
+                     # pooling requests to be chunked.
+                     if (
+                         not self.scheduler_config.enable_chunked_prefill
+                         and num_new_tokens > token_budget
+                     ):
+                         # If chunked_prefill is disabled,
+                         # we can stop the scheduling here.
+                         break
+
+                     num_new_tokens = min(num_new_tokens, token_budget)
+                     assert num_new_tokens > 0
+
+                     # Schedule encoder inputs.
+                     if request.has_encoder_inputs:
+                         (
+                             encoder_inputs_to_schedule,
+                             num_new_tokens,
+                             new_encoder_compute_budget,
+                             external_load_encoder_input,
+                         ) = self._try_schedule_encoder_inputs(
+                             request,
+                             num_computed_tokens,
+                             num_new_tokens,
+                             encoder_compute_budget,
+                             shift_computed_tokens=1 if self.use_eagle else 0,
+                         )
+                         if num_new_tokens == 0:
+                             # The request cannot be scheduled.
+                             break
+
+                 # Handles an edge case when P/D Disaggregation
+                 # is used with Spec Decoding where an
+                 # extra block gets allocated which
+                 # creates a mismatch between the number
+                 # of local and remote blocks.
+                 effective_lookahead_tokens = (
+                     0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens
+                 )
+
+                 num_encoder_tokens = (
+                     self._num_encoder_max_input_tokens
+                     if self.is_encoder_decoder and request.has_encoder_inputs
+                     else 0
+                 )
+
+                 new_blocks = self.kv_cache_manager.allocate_slots(
+                     request,
+                     num_new_tokens,
+                     num_new_computed_tokens=num_new_local_computed_tokens,
+                     new_computed_blocks=new_computed_blocks,
+                     num_lookahead_tokens=effective_lookahead_tokens,
+                     num_external_computed_tokens=num_external_computed_tokens,
+                     delay_cache_blocks=load_kv_async,
+                     num_encoder_tokens=num_encoder_tokens,
+                 )
+
+                 if new_blocks is None:
+                     # The request cannot be scheduled.
+
+                     # NOTE: we need to release the request from the encoder
+                     # cache manager.
+                     if request.has_encoder_inputs:
+                         self.encoder_cache_manager.free(request)
+                     break
+
+                 # KVTransfer: the connector uses this info to determine
+                 # whether a KV load is needed for this request.
+                 if self.connector is not None:
+                     self.connector.update_state_after_alloc(
+                         request,
+                         self.kv_cache_manager.get_blocks(request.request_id),
+                         num_external_computed_tokens,
+                     )
+
+                 # Request was already popped from self.waiting
+                 # unless it was re-added above due to new_blocks being None.
+                 request = self.waiting.pop_request()
+                 if load_kv_async:
+                     # If loading async, allocate memory and put request
+                     # into the WAITING_FOR_REMOTE_KV state.
+                     skipped_waiting_requests.prepend_request(request)
+                     request.status = RequestStatus.WAITING_FOR_REMOTE_KVS
+                     continue
+
+                 self._update_connector_prefix_cache_stats(request)
+
+                 self.running.append(request)
+                 if self.log_stats:
+                     request.record_event(
+                         EngineCoreEventType.SCHEDULED, scheduled_timestamp
+                     )
+                 if request.status == RequestStatus.WAITING:
+                     scheduled_new_reqs.append(request)
+                 elif request.status == RequestStatus.PREEMPTED:
+                     scheduled_resumed_reqs.append(request)
+                 else:
+                     raise RuntimeError(f"Invalid request status: {request.status}")
+
+                 if self.lora_config and request.lora_request:
+                     scheduled_loras.add(request.lora_request.lora_int_id)
+                 req_to_new_blocks[request.request_id] = (
+                     self.kv_cache_manager.get_blocks(request.request_id)
+                 )
+                 num_scheduled_tokens[request.request_id] = num_new_tokens
+                 token_budget -= num_new_tokens
+                 request.status = RequestStatus.RUNNING
+                 request.num_computed_tokens = num_computed_tokens
+                 # Count the number of prefix cached tokens.
+                 if request.num_cached_tokens < 0:
+                     request.num_cached_tokens = num_computed_tokens
+                 # Encoder-related.
+                 if encoder_inputs_to_schedule:
+                     scheduled_encoder_inputs[request.request_id] = (
+                         encoder_inputs_to_schedule
+                     )
+                     # Allocate the encoder cache.
+                     for i in encoder_inputs_to_schedule:
+                         self.encoder_cache_manager.allocate(request, i)
+                     encoder_compute_budget = new_encoder_compute_budget
+                 # Allocate for external load encoder cache.
+                 if external_load_encoder_input:
+                     for i in external_load_encoder_input:
+                         self.encoder_cache_manager.allocate(request, i)
+                         if self.ec_connector is not None:
+                             self.ec_connector.update_state_after_alloc(request, i)
+
+         # Put back any skipped requests at the head of the waiting queue.
+         if skipped_waiting_requests:
+             self.waiting.prepend_requests(skipped_waiting_requests)
+
+         # Check if the scheduling constraints are satisfied.
+         total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
+         assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
+
+         assert token_budget >= 0
+         assert len(self.running) <= self.max_num_running_reqs
+         # Since some requests in the RUNNING queue may not be scheduled in
+         # this step, the total number of scheduled requests can be smaller than
+         # len(self.running).
+         assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len(
+             scheduled_running_reqs
+         ) <= len(self.running)
+
+         # Get the longest common prefix among all requests in the running queue.
+         # This can be potentially used for cascade attention.
+         num_common_prefix_blocks = [0] * len(self.kv_cache_config.kv_cache_groups)
+         with record_function_or_nullcontext("schedule: get_num_common_prefix_blocks"):
+             if self.running:
+                 any_request = self.running[0]
+                 num_common_prefix_blocks = (
+                     self.kv_cache_manager.get_num_common_prefix_blocks(
+                         any_request.request_id
+                     )
+                 )
+
+         # Construct the scheduler output.
+         if self.use_v2_model_runner:
+             scheduled_new_reqs = scheduled_new_reqs + scheduled_resumed_reqs
+             scheduled_resumed_reqs = []
+             new_reqs_data = [
+                 NewRequestData.from_request(
+                     req,
+                     req_to_new_blocks[req.request_id].get_block_ids(),
+                     req._all_token_ids,
+                 )
+                 for req in scheduled_new_reqs
+             ]
+         else:
+             new_reqs_data = [
+                 NewRequestData.from_request(
+                     req, req_to_new_blocks[req.request_id].get_block_ids()
+                 )
+                 for req in scheduled_new_reqs
+             ]
+
+         with record_function_or_nullcontext("schedule: make_cached_request_data"):
+             cached_reqs_data = self._make_cached_request_data(
+                 scheduled_running_reqs,
+                 scheduled_resumed_reqs,
+                 num_scheduled_tokens,
+                 scheduled_spec_decode_tokens,
+                 req_to_new_blocks,
+             )
+
+         # Record the request ids that were scheduled in this step.
+         self.prev_step_scheduled_req_ids.clear()
+         self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys())
+
+         scheduler_output = SchedulerOutput(
+             scheduled_new_reqs=new_reqs_data,
+             scheduled_cached_reqs=cached_reqs_data,
+             num_scheduled_tokens=num_scheduled_tokens,
+             total_num_scheduled_tokens=total_num_scheduled_tokens,
+             scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
+             scheduled_encoder_inputs=scheduled_encoder_inputs,
+             num_common_prefix_blocks=num_common_prefix_blocks,
+             preempted_req_ids={req.request_id for req in preempted_reqs},
+             # finished_req_ids is existing state in the scheduler,
+             # rather than something newly scheduled in this step.
+             # It contains the request IDs that finished in between
+             # the previous and the current steps.
+             finished_req_ids=self.finished_req_ids,
+             free_encoder_mm_hashes=self.encoder_cache_manager.get_freed_mm_hashes(),
+         )
+
+         # NOTE(Kuntai): this function is designed for multiple purposes:
+         # 1. Plan the KV cache store
+         # 2. Wrap up all the KV cache load / save ops into an opaque object
+         # 3. Clear the internal states of the connector
+         if self.connector is not None:
+             meta: KVConnectorMetadata = self.connector.build_connector_meta(
+                 scheduler_output
+             )
+             scheduler_output.kv_connector_metadata = meta
+
+         # Build the connector meta for ECConnector.
+         if self.ec_connector is not None:
+             ec_meta: ECConnectorMetadata = self.ec_connector.build_connector_meta(
+                 scheduler_output
+             )
+             scheduler_output.ec_connector_metadata = ec_meta
+
+         with record_function_or_nullcontext("schedule: update_after_schedule"):
+             self._update_after_schedule(scheduler_output)
+         return scheduler_output
+
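When allocate_slots fails in the retry loop above under the PRIORITY policy, the victim is picked with max(self.running, key=lambda r: (r.priority, r.arrival_time)): the numerically largest priority value loses (in vLLM a lower value means more important), and among equals the latest arrival is evicted first. An illustrative standalone sketch, not part of the packaged file (Req and the sample data are hypothetical):

    from dataclasses import dataclass

    @dataclass
    class Req:
        request_id: str
        priority: int        # lower value = more important, as in vLLM
        arrival_time: float

    running = [
        Req("a", priority=0, arrival_time=1.0),
        Req("b", priority=2, arrival_time=2.0),
        Req("c", priority=2, arrival_time=3.0),
    ]
    # Victim: worst (highest) priority value, ties broken by latest arrival.
    victim = max(running, key=lambda r: (r.priority, r.arrival_time))
    assert victim.request_id == "c"
    # Under the FCFS policy the scheduler instead pops the tail of self.running.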
+     def _preempt_request(self, request: Request, timestamp: float) -> None:
+         """Preempt a request and put it back to the waiting queue.
+
+         NOTE: The request should be popped from the running queue outside of this
+         method.
+         """
+         assert request.status == RequestStatus.RUNNING, (
+             "Only running requests can be preempted"
+         )
+         self.kv_cache_manager.free(request)
+         self.encoder_cache_manager.free(request)
+         request.status = RequestStatus.PREEMPTED
+         request.num_computed_tokens = 0
+         request.spec_token_ids.clear()
+         request.num_preemptions += 1
+         if self.log_stats:
+             request.record_event(EngineCoreEventType.PREEMPTED, timestamp)
+
+         # Put the request back to the waiting queue.
+         self.waiting.prepend_request(request)
+
+     def _update_after_schedule(self, scheduler_output: SchedulerOutput) -> None:
+         # Advance the number of computed tokens for the request AFTER
+         # the request is scheduled.
+         # 1. The scheduler_output of the current step has to include the
+         #    original number of scheduled tokens to determine input IDs.
+         # 2. Advancing the number of computed tokens here allows us to
+         #    schedule the prefill request again immediately in the next
+         #    scheduling step.
+         # 3. If some tokens (e.g. spec tokens) are rejected later, the number of
+         #    computed tokens will be adjusted in update_from_output.
+         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
+         for req_id, num_scheduled_token in num_scheduled_tokens.items():
+             request = self.requests[req_id]
+             request.num_computed_tokens += num_scheduled_token
+
+             # NOTE: _free_encoder_inputs relies on num_computed_tokens, which
+             # may be updated again in _update_from_output for speculative
+             # decoding. However, it is safe to call the method here because
+             # encoder inputs are always part of the prompt, not the output,
+             # and thus are unaffected by speculative decoding.
+             if request.has_encoder_inputs:
+                 self._free_encoder_inputs(request)
+
+         # Clear the finished request IDs.
+         # NOTE: We shouldn't do self.finished_req_ids.clear() here because
+         # it will also affect the scheduler output.
+         self.finished_req_ids = set()
+
849
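Advancing num_computed_tokens eagerly is what lets a chunked prefill continue on consecutive steps without waiting for model output. A small sketch of the progression, with hypothetical numbers (chunk size 512, a 1200-token prompt):

# Sketch: eager advancement of computed tokens for a chunked prefill.
# Numbers are hypothetical; the loop mimics successive schedule() steps.
prompt_len, chunk = 1200, 512
num_computed_tokens = 0
steps = []
while num_computed_tokens < prompt_len:
    scheduled = min(chunk, prompt_len - num_computed_tokens)
    # _update_after_schedule: advance immediately so the request is
    # eligible again in the very next scheduling step.
    num_computed_tokens += scheduled
    steps.append(scheduled)
assert steps == [512, 512, 176]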
+    def _make_cached_request_data(
+        self,
+        running_reqs: list[Request],
+        resumed_reqs: list[Request],
+        num_scheduled_tokens: dict[str, int],
+        spec_decode_tokens: dict[str, list[int]],
+        req_to_new_blocks: dict[str, KVCacheBlocks],
+    ) -> CachedRequestData:
+        req_ids: list[str] = []
+        new_token_ids: list[list[int]] = []
+        new_block_ids: list[tuple[list[int], ...] | None] = []
+        all_token_ids: dict[str, list[int]] = {}
+        num_computed_tokens: list[int] = []
+        num_output_tokens: list[int] = []
+        resumed_req_ids = set()
+
+        num_running_reqs = len(running_reqs)
+        for idx, req in enumerate(itertools.chain(running_reqs, resumed_reqs)):
+            req_id = req.request_id
+            req_ids.append(req_id)
+            if self.use_pp:
+                # When using PP, the scheduler sends the sampled tokens back,
+                # because there's no direct communication between the first-
+                # stage worker and the last-stage worker. Otherwise, we don't
+                # need to send the sampled tokens back because the model runner
+                # will cache them.
+                num_tokens = num_scheduled_tokens[req_id] - len(
+                    spec_decode_tokens.get(req_id, ())
+                )
+                token_ids = req.all_token_ids[
+                    req.num_computed_tokens : req.num_computed_tokens + num_tokens
+                ]
+                new_token_ids.append(token_ids)
+            scheduled_in_prev_step = req_id in self.prev_step_scheduled_req_ids
+            if idx >= num_running_reqs:
+                assert not scheduled_in_prev_step
+                resumed_req_ids.add(req_id)
+            if not scheduled_in_prev_step:
+                all_token_ids[req_id] = req.all_token_ids.copy()
+            new_block_ids.append(
+                req_to_new_blocks[req_id].get_block_ids(allow_none=True)
+            )
+            num_computed_tokens.append(req.num_computed_tokens)
+            num_output_tokens.append(
+                req.num_output_tokens + req.num_output_placeholders
+            )
+
+        return CachedRequestData(
+            req_ids=req_ids,
+            resumed_req_ids=resumed_req_ids,
+            new_token_ids=new_token_ids,
+            all_token_ids=all_token_ids,
+            new_block_ids=new_block_ids,
+            num_computed_tokens=num_computed_tokens,
+            num_output_tokens=num_output_tokens,
+        )
+
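Under pipeline parallelism the scheduler ships back only the tokens sampled since the last step, excluding the speculative drafts it scheduled. A small worked example of that slice, with made-up values:

# Sketch of the pipeline-parallel token slice above, hypothetical values.
all_token_ids = list(range(100, 112))  # prompt + outputs so far
num_computed_tokens = 10               # tokens the worker has processed
num_scheduled = 3                      # tokens scheduled this step
num_spec = 1                           # scheduled speculative drafts

# Drafts are not sampled tokens, so they are excluded from the slice.
num_tokens = num_scheduled - num_spec
token_ids = all_token_ids[num_computed_tokens : num_computed_tokens + num_tokens]
assert token_ids == [110, 111]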
+    def _try_schedule_encoder_inputs(
+        self,
+        request: Request,
+        num_computed_tokens: int,
+        num_new_tokens: int,
+        encoder_compute_budget: int,
+        shift_computed_tokens: int = 0,
+    ) -> tuple[list[int], int, int, list[int]]:
+        """
+        Determine which encoder inputs need to be scheduled in the current step,
+        and update `num_new_tokens` and encoder token budget accordingly.
+
+        An encoder input will be scheduled if:
+        - Its output tokens overlap with the range of tokens being computed
+          in this step, i.e.,
+          [num_computed_tokens, num_computed_tokens + num_new_tokens).
+        - It is not already computed and stored in the encoder cache.
+        - It does not exist in the remote encoder cache (via ECConnector).
+        - There is sufficient encoder token budget to process it.
+        - The encoder cache has space to store it.
+
+        If an encoder input cannot be scheduled due to cache or budget
+        limitations, the method adjusts `num_new_tokens` to schedule only the
+        decoder tokens up to just before the unschedulable encoder input.
+
+        Note that num_computed_tokens includes both locally cached
+        blocks and externally cached blocks (via KVConnector).
+        """
+        if num_new_tokens == 0 or not request.has_encoder_inputs:
+            return [], num_new_tokens, encoder_compute_budget, []
+        encoder_inputs_to_schedule: list[int] = []
+        mm_features = request.mm_features
+        assert mm_features is not None
+        assert len(mm_features) > 0
+        external_load_encoder_input = []
+
+        # Check remote cache first
+        if self.ec_connector is not None:
+            remote_cache_has_item = self.ec_connector.has_caches(request)
+        # NOTE: since scheduler operates on the request level (possibly with
+        # multiple encoder inputs per request), we need to create temporary
+        # trackers for accounting at the encoder input level.
+        mm_hashes_to_schedule = set()
+        num_embeds_to_schedule = 0
+        for i, mm_feature in enumerate(mm_features):
+            start_pos = mm_feature.mm_position.offset
+            num_encoder_tokens = mm_feature.mm_position.length
+            num_encoder_embeds = mm_feature.mm_position.get_num_embeds
+
+            # The encoder output is needed if the two ranges overlap:
+            # [num_computed_tokens, num_computed_tokens + num_new_tokens) and
+            # [start_pos, start_pos + num_encoder_tokens)
+            if (
+                start_pos
+                >= num_computed_tokens + num_new_tokens + shift_computed_tokens
+            ):
+                # The encoder input is not needed in this step.
+                break
+
+            if self.is_encoder_decoder and num_computed_tokens > 0:
+                assert start_pos == 0, (
+                    "Encoder input should be processed at the beginning of "
+                    "the sequence when encoder-decoder models are used."
+                )
+                # Encoder input has already been computed.
+                # The calculation here is a bit different. We don't turn encoder
+                # output into tokens that get processed by the decoder and
+                # reflected in num_computed_tokens. Instead, start_pos reflects
+                # the position where we need to ensure we calculate encoder
+                # inputs. This should always be 0 to ensure we calculate encoder
+                # inputs before running the decoder. Once we've calculated some
+                # decoder tokens (num_computed_tokens > 0), then we know we
+                # already calculated encoder inputs and can skip here.
+                continue
+            elif start_pos + num_encoder_tokens <= num_computed_tokens:
+                # The encoder input is already computed and stored
+                # in the decoder's KV cache.
+                continue
+
+            if not self.is_encoder_decoder:
+                # We are not using the encoder cache for encoder-decoder
+                # models, yet.
+                if request.mm_features[i].identifier in mm_hashes_to_schedule:
+                    # The same encoder input has already been scheduled in the
+                    # current step.
+                    continue
+
+                if self.encoder_cache_manager.check_and_update_cache(request, i):
+                    # The encoder input is already computed and cached from a
+                    # previous step.
+                    continue
+
+            # If no encoder input chunking is allowed, we do not want to
+            # partially schedule a multimodal item. If the scheduled range would
+            # only cover part of the mm input, roll back to before the mm item.
+            if (
+                self.scheduler_config.disable_chunked_mm_input
+                and num_computed_tokens < start_pos
+                and (num_computed_tokens + num_new_tokens)
+                < (start_pos + num_encoder_tokens)
+            ):
+                num_new_tokens = start_pos - num_computed_tokens
+                break
+            if not self.encoder_cache_manager.can_allocate(
+                request, i, encoder_compute_budget, num_embeds_to_schedule
+            ):
+                # The encoder cache is full or the encoder budget is exhausted.
+                # NOTE(woosuk): We assume that the encoder input tokens should
+                # be processed altogether, as the encoder usually uses
+                # bidirectional attention.
+                if num_computed_tokens + shift_computed_tokens < start_pos:
+                    # We only schedule the decoder tokens just before the
+                    # encoder input.
+                    num_new_tokens = start_pos - (
+                        num_computed_tokens + shift_computed_tokens
+                    )
+                else:
+                    # Because of prefix caching, num_computed_tokens is greater
+                    # than start_pos even though its encoder input is not
+                    # available. In this case, we can't schedule any token for
+                    # the request in this step.
+                    num_new_tokens = 0
+                break
+
+            # Calculate the number of embeddings to schedule in the current
+            # range of scheduled encoder placeholder tokens.
+            start_idx_rel = max(0, num_computed_tokens - start_pos)
+            end_idx_rel = min(
+                num_encoder_tokens, num_computed_tokens + num_new_tokens - start_pos
+            )
+            curr_embeds_start, curr_embeds_end = (
+                mm_feature.mm_position.get_embeds_indices_in_range(
+                    start_idx_rel, end_idx_rel
+                )
+            )
+            # There are no embeddings in the current range of encoder
+            # placeholder tokens, so we can skip the encoder input.
+            if curr_embeds_end - curr_embeds_start == 0:
+                continue
+
+            if self.ec_connector is not None and remote_cache_has_item[i]:
+                mm_hashes_to_schedule.add(request.mm_features[i].identifier)
+                external_load_encoder_input.append(i)
+                num_embeds_to_schedule += num_encoder_embeds
+                continue
+
+            num_embeds_to_schedule += num_encoder_embeds
+            encoder_compute_budget -= num_encoder_embeds
+            mm_hashes_to_schedule.add(request.mm_features[i].identifier)
+            encoder_inputs_to_schedule.append(i)
+
+        return (
+            encoder_inputs_to_schedule,
+            num_new_tokens,
+            encoder_compute_budget,
+            external_load_encoder_input,
+        )
+
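The scheduling condition above boils down to an interval-overlap test between the decoder tokens computed this step and each multimodal item's placeholder span. A compact standalone sketch of that test, with hypothetical positions:

# Sketch of the interval-overlap test used above: an encoder input at
# [start_pos, start_pos + num_encoder_tokens) is needed iff it overlaps
# [num_computed_tokens, num_computed_tokens + num_new_tokens).
def needs_encoder_input(
    num_computed_tokens: int,
    num_new_tokens: int,
    start_pos: int,
    num_encoder_tokens: int,
) -> bool:
    past_end = start_pos >= num_computed_tokens + num_new_tokens
    already_computed = start_pos + num_encoder_tokens <= num_computed_tokens
    return not (past_end or already_computed)


# An image whose placeholders span [50, 200) is needed when this step
# computes tokens [0, 128), but not once 200+ tokens are computed.
assert needs_encoder_input(0, 128, 50, 150)
assert not needs_encoder_input(200, 64, 50, 150)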
+    def get_grammar_bitmask(
+        self, scheduler_output: SchedulerOutput
+    ) -> GrammarOutput | None:
+        # Collect list of scheduled request ids that use structured output.
+        # The corresponding rows of the bitmask will be in this order.
+        # PERF: in the case of chunked prefill, a request might not include
+        # any new tokens. Therefore, we might spend an extra cycle filling in
+        # the bitmask, which could be a big no-op.
+        structured_output_request_ids = [
+            req_id
+            for req_id in scheduler_output.num_scheduled_tokens
+            if (req := self.requests.get(req_id)) and req.use_structured_output
+        ]
+        if not structured_output_request_ids:
+            return None
+
+        bitmask = self.structured_output_manager.grammar_bitmask(
+            self.requests,
+            structured_output_request_ids,
+            scheduler_output.scheduled_spec_decode_tokens,
+        )
+        return GrammarOutput(structured_output_request_ids, bitmask)
+
+    def update_from_output(
+        self,
+        scheduler_output: SchedulerOutput,
+        model_runner_output: ModelRunnerOutput,
+    ) -> dict[int, EngineCoreOutputs]:
+        sampled_token_ids = model_runner_output.sampled_token_ids
+        logprobs = model_runner_output.logprobs
+        prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict
+        num_scheduled_tokens = scheduler_output.num_scheduled_tokens
+        pooler_outputs = model_runner_output.pooler_output
+        num_nans_in_logits = model_runner_output.num_nans_in_logits
+        kv_connector_output = model_runner_output.kv_connector_output
+        cudagraph_stats = model_runner_output.cudagraph_stats
+
+        perf_stats: PerfStats | None = None
+        if self.perf_metrics and self.perf_metrics.is_enabled():
+            perf_stats = self.perf_metrics.get_step_perf_stats_per_gpu(scheduler_output)
+
+        outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
+        spec_decoding_stats: SpecDecodingStats | None = None
+        kv_connector_stats: KVConnectorStats | None = (
+            kv_connector_output.kv_connector_stats if kv_connector_output else None
+        )
+        if kv_connector_stats and self.connector:
+            kv_stats = self.connector.get_kv_connector_stats()
+            if kv_stats:
+                kv_connector_stats = kv_connector_stats.aggregate(kv_stats)
+
+        failed_kv_load_req_ids = None
+        if kv_connector_output and kv_connector_output.invalid_block_ids:
+            # These blocks contain externally computed tokens that failed to
+            # load. Identify affected requests and adjust their computed token
+            # count to trigger recomputation of the invalid blocks.
+            failed_kv_load_req_ids = self._handle_invalid_blocks(
+                kv_connector_output.invalid_block_ids
+            )
+
+        # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more,
+        # the below loop can be a performance bottleneck. We should do our best
+        # to avoid expensive operations inside the loop.
+        stopped_running_reqs: set[Request] = set()
+        stopped_preempted_reqs: set[Request] = set()
+        for req_id, num_tokens_scheduled in num_scheduled_tokens.items():
+            assert num_tokens_scheduled > 0
+            if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids:
+                # Skip failed or rescheduled requests from KV load failure.
+                continue
+            request = self.requests.get(req_id)
+            if request is None:
+                # The request is already finished. This can happen if the
+                # request is aborted while the model is executing it (e.g.,
+                # in pipeline parallelism).
+                continue
+
+            req_index = model_runner_output.req_id_to_index[req_id]
+            generated_token_ids = (
+                sampled_token_ids[req_index] if sampled_token_ids else []
+            )
+
+            scheduled_spec_token_ids = (
+                scheduler_output.scheduled_spec_decode_tokens.get(req_id)
+            )
+            if scheduled_spec_token_ids:
+                num_draft_tokens = len(scheduled_spec_token_ids)
+                num_accepted = len(generated_token_ids) - 1
+                num_rejected = num_draft_tokens - num_accepted
+                # num_computed_tokens represents the number of tokens
+                # processed in the current step, considering scheduled
+                # tokens and rejections. If some tokens are rejected,
+                # num_computed_tokens is decreased by the number of rejected
+                # tokens.
+                if request.num_computed_tokens > 0:
+                    request.num_computed_tokens -= num_rejected
+                # If async scheduling, num_output_placeholders also includes
+                # the scheduled spec tokens count and so is similarly adjusted.
+                if request.num_output_placeholders > 0:
+                    request.num_output_placeholders -= num_rejected
+                spec_decoding_stats = self.make_spec_decoding_stats(
+                    spec_decoding_stats,
+                    num_draft_tokens=num_draft_tokens,
+                    num_accepted_tokens=num_accepted,
+                    num_invalid_spec_tokens=scheduler_output.num_invalid_spec_tokens,
+                    request_id=req_id,
+                )
+
+            stopped = False
+            new_logprobs = None
+            new_token_ids = generated_token_ids
+            pooler_output = pooler_outputs[req_index] if pooler_outputs else None
+            kv_transfer_params = None
+            status_before_stop = request.status
+
+            # Check for stop and update request status.
+            if new_token_ids:
+                new_token_ids, stopped = self._update_request_with_output(
+                    request, new_token_ids
+                )
+            elif request.pooling_params and pooler_output is not None:
+                # Pooling stops as soon as there is output.
+                request.status = RequestStatus.FINISHED_STOPPED
+                stopped = True
+
+            routed_experts = None
+            if stopped:
+                if self.vllm_config.model_config.enable_return_routed_experts:
+                    kv_blocks = self.kv_cache_manager.get_blocks(request.request_id)
+                    block_ids = kv_blocks.get_block_ids()[0]
+                    num_tokens = request.num_tokens - 1
+
+                    # Compute the slot mapping.
+                    block_ids_array = np.array(block_ids, dtype=np.int32)
+                    num_blocks = len(block_ids)
+                    block_size = self.block_size
+
+                    # Generate block offsets.
+                    block_offsets = np.arange(0, block_size)
+
+                    # Compute slot mapping: slot = block_id * block_size + offset
+                    slot_mapping = (
+                        block_offsets.reshape((1, block_size))
+                        + block_ids_array.reshape((num_blocks, 1)) * block_size
+                    ).flatten()[:num_tokens]
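+                    # For example, with block_size=16 and block_ids=[7, 2],
+                    # the slots are [112..127, 32..47]; truncating to
+                    # num_tokens keeps exactly one slot per prompt token.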
+
+                    routed_experts = self.routed_experts_reader.get_routed_experts(
+                        indices=slot_mapping
+                    )
+                kv_transfer_params = self._free_request(request)
+                if status_before_stop == RequestStatus.RUNNING:
+                    stopped_running_reqs.add(request)
+                else:
+                    stopped_preempted_reqs.add(request)
+
+            # Extract sample logprobs if needed.
+            if (
+                request.sampling_params is not None
+                and request.sampling_params.logprobs is not None
+                and logprobs
+            ):
+                new_logprobs = logprobs.slice_request(req_index, len(new_token_ids))
+
+            if new_token_ids and self.structured_output_manager.should_advance(request):
+                struct_output_request = request.structured_output_request
+                assert struct_output_request is not None
+                assert struct_output_request.grammar is not None
+                ok = struct_output_request.grammar.accept_tokens(req_id, new_token_ids)
+                if not ok:
+                    logger.warning(
+                        "Unexpected: grammar rejected tokens %s for request %s.",
+                        new_token_ids,
+                        req_id,
+                    )
+
+            if num_nans_in_logits is not None and req_id in num_nans_in_logits:
+                request.num_nans_in_logits = num_nans_in_logits[req_id]
+
+            # Get prompt logprobs for this request.
+            prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
+            if new_token_ids or pooler_output is not None or kv_transfer_params:
+                # Add EngineCoreOutput for this Request.
+                outputs[request.client_index].append(
+                    EngineCoreOutput(
+                        request_id=req_id,
+                        new_token_ids=new_token_ids,
+                        finish_reason=request.get_finished_reason(),
+                        new_logprobs=new_logprobs,
+                        new_prompt_logprobs_tensors=prompt_logprobs_tensors,
+                        pooling_output=pooler_output,
+                        stop_reason=request.stop_reason,
+                        events=request.take_events(),
+                        kv_transfer_params=kv_transfer_params,
+                        trace_headers=request.trace_headers,
+                        num_cached_tokens=request.num_cached_tokens,
+                        routed_experts=routed_experts,
+                        num_nans_in_logits=request.num_nans_in_logits,
+                    )
+                )
+            else:
+                # Invariant: EngineCore returns no partial prefill outputs.
+                assert not prompt_logprobs_tensors
+
+        # Remove the stopped requests from the running and waiting queues.
+        if stopped_running_reqs:
+            self.running = remove_all(self.running, stopped_running_reqs)
+        if stopped_preempted_reqs:
+            # This is a rare case and unlikely to impact performance.
+            self.waiting.remove_requests(stopped_preempted_reqs)
+
+        if failed_kv_load_req_ids and not self.recompute_kv_load_failures:
+            requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids]
+            self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR)
+            for request in requests:
+                outputs[request.client_index].append(
+                    EngineCoreOutput(
+                        request_id=request.request_id,
+                        new_token_ids=[],
+                        finish_reason=request.get_finished_reason(),
+                        events=request.take_events(),
+                        trace_headers=request.trace_headers,
+                        num_cached_tokens=request.num_cached_tokens,
+                    )
+                )
+
+        # KV Connector: update state for finished KV Transfers.
+        if kv_connector_output:
+            self._update_from_kv_xfer_finished(kv_connector_output)
+
+        # Collect KV cache events from the KV cache manager.
+        events = self.kv_cache_manager.take_events()
+
+        # Collect KV cache events from the connector.
+        if self.connector is not None:
+            connector_events = self.connector.take_events()
+            if connector_events:
+                if events is None:
+                    events = list(connector_events)
+                else:
+                    events.extend(connector_events)
+
+        # Publish the collected KV cache events.
+        if events:
+            batch = KVEventBatch(ts=time.time(), events=events)
+            self.kv_event_publisher.publish(batch)
+
+        # Create EngineCoreOutputs for all clients that have requests with
+        # outputs in this step.
+        engine_core_outputs = {
+            client_index: EngineCoreOutputs(outputs=outs)
+            for client_index, outs in outputs.items()
+        }
+
+        finished_req_ids = self.finished_req_ids_dict
+        if finished_req_ids:
+            # Include ids of requests that finished since last outputs
+            # were sent.
+            for client_index, finished_set in finished_req_ids.items():
+                # Set finished request set in EngineCoreOutputs for this client.
+                if (eco := engine_core_outputs.get(client_index)) is not None:
+                    eco.finished_requests = finished_set
+                else:
+                    engine_core_outputs[client_index] = EngineCoreOutputs(
+                        finished_requests=finished_set
+                    )
+            finished_req_ids.clear()
+
+        if (
+            stats := self.make_stats(
+                spec_decoding_stats, kv_connector_stats, cudagraph_stats, perf_stats
+            )
+        ) is not None:
+            # Return stats to only one of the front-ends.
+            if (eco := next(iter(engine_core_outputs.values()), None)) is None:
+                # We must return the stats even if there are no request
+                # outputs this step.
+                engine_core_outputs[0] = eco = EngineCoreOutputs()
+            eco.scheduler_stats = stats
+
+        return engine_core_outputs
+
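When speculative decoding is enabled, the number of sampled tokens reveals how many drafts were accepted: the target model always contributes one token beyond the accepted prefix, and rejected drafts roll num_computed_tokens back. A small worked example of that arithmetic, with hypothetical counts:

# Sketch of the draft-acceptance arithmetic above, with made-up counts.
num_computed_tokens = 40              # advanced optimistically at schedule time
scheduled_spec_token_ids = [7, 7, 7]  # 3 draft tokens were scheduled
generated_token_ids = [7, 7, 42]      # 2 accepted drafts + 1 bonus token

num_draft_tokens = len(scheduled_spec_token_ids)  # 3
num_accepted = len(generated_token_ids) - 1       # 2
num_rejected = num_draft_tokens - num_accepted    # 1

# Rejected drafts were counted as computed when scheduled, so roll back.
num_computed_tokens -= num_rejected
assert num_computed_tokens == 39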
+    def _update_request_with_output(
+        self, request: Request, new_token_ids: list[int]
+    ) -> tuple[list[int], bool]:
+        # Append generated tokens and check for stop. Note that if
+        # a request is still being prefilled, we expect the model runner
+        # to return empty token ids for the request.
+        stopped = False
+        for num_new, output_token_id in enumerate(new_token_ids, 1):
+            request.append_output_token_ids(output_token_id)
+
+            # Check for stop and update request state.
+            # This must be called before we make the EngineCoreOutput.
+            stopped = check_stop(request, self.max_model_len)
+            if stopped:
+                del new_token_ids[num_new:]  # Trim new tokens if needed.
+                break
+        return new_token_ids, stopped
+
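The enumerate(..., 1) starting index makes the truncation keep the stop token itself: if the stop condition fires on the k-th new token, everything after it is dropped. A tiny standalone illustration (check_stop is replaced by a hypothetical stop-token test):

# Sketch of the stop-truncation above: stop fires on the 2nd of 4 tokens.
new_token_ids = [11, 99, 12, 13]     # 99 is a hypothetical stop token
for num_new, token_id in enumerate(new_token_ids, 1):
    if token_id == 99:               # stand-in for check_stop(...)
        del new_token_ids[num_new:]  # keep the stop token, drop the rest
        break
assert new_token_ids == [11, 99]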
+    def _free_encoder_inputs(self, request: Request) -> None:
+        cached_encoder_input_ids = self.encoder_cache_manager.get_cached_input_ids(
+            request
+        )
+        # OPTIMIZATION: Avoid list(set) if the set is empty.
+        if not cached_encoder_input_ids:
+            return
+
+        # Here, we use list(set) to avoid modifying the set while iterating
+        # over it.
+        for input_id in list(cached_encoder_input_ids):
+            mm_feature = request.mm_features[input_id]
+            start_pos = mm_feature.mm_position.offset
+            num_tokens = mm_feature.mm_position.length
+            if self.is_encoder_decoder and request.num_computed_tokens > 0:
+                # With Whisper, as soon as we've generated a single token,
+                # we know we're done with the encoder input. Cross Attention
+                # KVs have been calculated and cached already.
+                self.encoder_cache_manager.free_encoder_input(request, input_id)
+            elif start_pos + num_tokens <= request.num_computed_tokens:
+                # The encoder output is already processed and stored
+                # in the decoder's KV cache.
+                self.encoder_cache_manager.free_encoder_input(request, input_id)
+
+    def update_draft_token_ids(self, draft_token_ids: DraftTokenIds) -> None:
+        for req_id, spec_token_ids in zip(
+            draft_token_ids.req_ids,
+            draft_token_ids.draft_token_ids,
+        ):
+            request = self.requests.get(req_id)
+            if request is None or request.is_finished():
+                # The request may have been finished. Skip.
+                continue
+
+            # Add newly generated spec token ids to the request.
+            if self.structured_output_manager.should_advance(request):
+                metadata = request.structured_output_request
+                spec_token_ids = metadata.grammar.validate_tokens(spec_token_ids)  # type: ignore[union-attr]
+            request.spec_token_ids = spec_token_ids
+
+    def update_draft_token_ids_in_output(
+        self, draft_token_ids: DraftTokenIds, scheduler_output: SchedulerOutput
+    ) -> None:
+        num_invalid_spec_tokens: dict[str, int] = {}
+
+        sched_spec_tokens = scheduler_output.scheduled_spec_decode_tokens
+        for req_id, spec_token_ids in zip(
+            draft_token_ids.req_ids,
+            draft_token_ids.draft_token_ids,
+        ):
+            request = self.requests.get(req_id)
+            if request is None or request.is_finished():
+                # The request may have been finished. Skip.
+                continue
+
+            placeholder_spec_tokens = sched_spec_tokens.get(req_id)
+            if not placeholder_spec_tokens:
+                continue
+
+            orig_num_spec_tokens = len(placeholder_spec_tokens)
+            # Trim the drafts to the scheduled number of spec tokens
+            # (needed for the chunked prefill case, for example).
+            del spec_token_ids[orig_num_spec_tokens:]
+            # Filter out spec tokens which do not adhere to the grammar.
+            if self.structured_output_manager.should_advance(request):
+                metadata = request.structured_output_request
+                assert metadata is not None and metadata.grammar is not None
+                spec_token_ids = metadata.grammar.validate_tokens(spec_token_ids)
+                # Pad to the original number of spec tokens.
+                num_invalid_tokens = orig_num_spec_tokens - len(spec_token_ids)
+                if num_invalid_tokens:
+                    spec_token_ids.extend([-1] * num_invalid_tokens)
+                    num_invalid_spec_tokens[req_id] = num_invalid_tokens
+
+            sched_spec_tokens[req_id] = spec_token_ids
+
+        scheduler_output.num_invalid_spec_tokens = num_invalid_spec_tokens
+
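The drafts are first trimmed to the scheduled count, then grammar-filtered, then padded with -1 placeholders so downstream tensors keep their original shape. A standalone sketch of that pipeline, where validate_tokens is a hypothetical stand-in for grammar.validate_tokens:

# Sketch of trim -> validate -> pad for draft tokens.
def validate_tokens(tokens: list[int]) -> list[int]:
    # Pretend only the first two drafts satisfy the grammar.
    return tokens[:2]


scheduled = [0, 0, 0, 0]           # 4 spec-token slots were scheduled
drafts = [17, 23, 31, 47, 59]      # drafter proposed 5 tokens

drafts = drafts[: len(scheduled)]  # trim to the scheduled count -> 4
drafts = validate_tokens(drafts)   # grammar keeps a valid prefix -> 2
num_invalid = len(scheduled) - len(drafts)
drafts.extend([-1] * num_invalid)  # pad so the shape stays fixed
assert drafts == [17, 23, -1, -1]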
+    def get_request_counts(self) -> tuple[int, int]:
+        """Returns (num_running_reqs, num_waiting_reqs)."""
+        return len(self.running), len(self.waiting)
+
+    def add_request(self, request: Request) -> None:
+        self.waiting.add_request(request)
+        self.requests[request.request_id] = request
+        if self.log_stats:
+            request.record_event(EngineCoreEventType.QUEUED)
+
+    def finish_requests(
+        self, request_ids: str | Iterable[str], finished_status: RequestStatus
+    ) -> None:
+        """Handles the finish signal from outside the scheduler.
+
+        For example, the API server can abort a request when the client
+        disconnects.
+        """
+        assert RequestStatus.is_finished(finished_status)
+        if isinstance(request_ids, str):
+            request_ids = (request_ids,)
+        else:
+            request_ids = set(request_ids)
+
+        running_requests_to_remove = set()
+        waiting_requests_to_remove = []
+        valid_requests = []
+
+        # First pass: collect requests to remove from the queues.
+        for req_id in request_ids:
+            request = self.requests.get(req_id)
+            if request is None or request.is_finished():
+                # Invalid request ID.
+                continue
+
+            valid_requests.append(request)
+            if request.status == RequestStatus.RUNNING:
+                running_requests_to_remove.add(request)
+            else:
+                waiting_requests_to_remove.append(request)
+
+        # Remove all requests from the queues at once for better efficiency.
+        if running_requests_to_remove:
+            self.running = remove_all(self.running, running_requests_to_remove)
+        if waiting_requests_to_remove:
+            self.waiting.remove_requests(waiting_requests_to_remove)
+
+        # Second pass: set status and free requests.
+        for request in valid_requests:
+            request.status = finished_status
+            self._free_request(request)
+
+    def _free_request(self, request: Request) -> dict[str, Any] | None:
+        assert request.is_finished()
+
+        delay_free_blocks, kv_xfer_params = self._connector_finished(request)
+        self.encoder_cache_manager.free(request)
+        request_id = request.request_id
+        self.finished_req_ids.add(request_id)
+        if self.finished_req_ids_dict is not None:
+            self.finished_req_ids_dict[request.client_index].add(request_id)
+
+        if not delay_free_blocks:
+            self._free_blocks(request)
+
+        return kv_xfer_params
+
+    def _free_blocks(self, request: Request):
+        assert request.is_finished()
+        self.kv_cache_manager.free(request)
+        del self.requests[request.request_id]
+
+    def get_num_unfinished_requests(self) -> int:
+        return len(self.waiting) + len(self.running)
+
+    def has_finished_requests(self) -> bool:
+        return len(self.finished_req_ids) > 0
+
+    def reset_prefix_cache(
+        self, reset_running_requests: bool = False, reset_connector: bool = False
+    ) -> bool:
+        """Reset the KV prefix cache.
+
+        If reset_running_requests is True, all the running requests will be
+        preempted and moved to the waiting queue.
+        Otherwise, this method will only reset the KV prefix cache when there
+        are no running requests taking KV cache.
+        """
+        if reset_running_requests:
+            # For logging.
+            timestamp = time.monotonic()
+            # Invalidate all the current running requests' KVs by pushing them
+            # to the waiting queue. In this case, we can reduce the ref count
+            # of all the KV blocks to 0 and thus we can make sure the reset is
+            # successful.
+            # Preempt in reverse order so the requests will be added back to
+            # the running queue in FIFO order.
+            while self.running:
+                request = self.running.pop()
+                self._preempt_request(request, timestamp)
+                # NOTE(zhuohan): For async scheduling, we need to discard the
+                # latest output token on the fly to avoid a redundant output
+                # token.
+                request.num_output_placeholders = 0
+                request.discard_latest_async_tokens = True
+
+            # Clear the scheduled request IDs cache. Since we are forcing
+            # preemption + resumption in the same step, we must act as if these
+            # requests were not scheduled in the prior step. They will be
+            # flushed from the persistent batch in the model runner.
+            self.prev_step_scheduled_req_ids.clear()
+
+        reset_successful = self.kv_cache_manager.reset_prefix_cache()
+        if reset_running_requests and not reset_successful:
+            raise RuntimeError(
+                "Failed to reset KV cache even when all the running requests "
+                "are preempted and moved to the waiting queue. This is likely "
+                "due to the presence of running requests waiting for remote "
+                "KV transfer, which is not supported yet."
+            )
+
+        if reset_connector:
+            reset_successful = self.reset_connector_cache() and reset_successful
+
+        return reset_successful
+
+    def reset_connector_cache(self) -> bool:
+        if self.connector is None:
+            logger.warning("reset_connector called but no KV connector is configured.")
+            return False
+
+        if self.connector.reset_cache() is False:
+            return False
+
+        if self.log_stats:
+            assert self.connector_prefix_cache_stats is not None
+            self.connector_prefix_cache_stats.reset = True
+
+        return True
+
+    def make_stats(
+        self,
+        spec_decoding_stats: SpecDecodingStats | None = None,
+        kv_connector_stats: KVConnectorStats | None = None,
+        cudagraph_stats: CUDAGraphStat | None = None,
+        perf_stats: PerfStats | None = None,
+    ) -> SchedulerStats | None:
+        if not self.log_stats:
+            return None
+        prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
+        assert prefix_cache_stats is not None
+        connector_prefix_cache_stats = self._make_connector_prefix_cache_stats()
+        eviction_events = (
+            self.kv_metrics_collector.drain_events()
+            if self.kv_metrics_collector is not None
+            else []
+        )
+        spec_stats = spec_decoding_stats
+        connector_stats_payload = (
+            kv_connector_stats.data if kv_connector_stats else None
+        )
+        return SchedulerStats(
+            num_running_reqs=len(self.running),
+            num_waiting_reqs=len(self.waiting),
+            kv_cache_usage=self.kv_cache_manager.usage,
+            prefix_cache_stats=prefix_cache_stats,
+            connector_prefix_cache_stats=connector_prefix_cache_stats,
+            kv_cache_eviction_events=eviction_events,
+            spec_decoding_stats=spec_stats,
+            kv_connector_stats=connector_stats_payload,
+            cudagraph_stats=cudagraph_stats,
+            perf_stats=perf_stats,
+        )
+
+    def make_spec_decoding_stats(
+        self,
+        spec_decoding_stats: SpecDecodingStats | None,
+        num_draft_tokens: int,
+        num_accepted_tokens: int,
+        num_invalid_spec_tokens: dict[str, int] | None,
+        request_id: str,
+    ) -> SpecDecodingStats | None:
+        if not self.log_stats or not num_draft_tokens:
+            return None
+        if spec_decoding_stats is None:
+            spec_decoding_stats = SpecDecodingStats.new(self.num_spec_tokens)
+        if num_invalid_spec_tokens:
+            num_draft_tokens -= num_invalid_spec_tokens.get(request_id, 0)
+        spec_decoding_stats.observe_draft(
+            num_draft_tokens=num_draft_tokens, num_accepted_tokens=num_accepted_tokens
+        )
+        return spec_decoding_stats
+
+    def shutdown(self) -> None:
+        if self.kv_event_publisher:
+            self.kv_event_publisher.shutdown()
+        if self.connector is not None:
+            self.connector.shutdown()
+
+    ########################################################################
+    # KV Connector Related Methods
+    ########################################################################
+
+    def _update_connector_prefix_cache_stats(self, request: Request) -> None:
+        if self.connector_prefix_cache_stats is None:
+            return
+
+        self.connector_prefix_cache_stats.record(
+            num_tokens=request.num_tokens,
+            num_hits=request.num_external_computed_tokens,
+            preempted=request.num_preemptions > 0,
+        )
+
+    def _make_connector_prefix_cache_stats(self) -> PrefixCacheStats | None:
+        if self.connector_prefix_cache_stats is None:
+            return None
+        stats = self.connector_prefix_cache_stats
+        self.connector_prefix_cache_stats = PrefixCacheStats()
+        return stats
+
+    def get_kv_connector(self) -> KVConnectorBase_V1 | None:
+        return self.connector
+
+    def _connector_finished(
+        self, request: Request
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Invoke the KV connector request_finished() method if applicable.
+
+        Returns optional kv transfer parameters to be included with the
+        request outputs.
+        """
+        if self.connector is None:
+            return False, None
+
+        # Free any out-of-window prefix blocks before we hand the block table
+        # to the connector.
+        self.kv_cache_manager.remove_skipped_blocks(
+            request_id=request.request_id,
+            total_computed_tokens=request.num_tokens,
+        )
+
+        block_ids = self.kv_cache_manager.get_block_ids(request.request_id)
+
+        if not isinstance(self.connector, SupportsHMA):
+            # NOTE(Kuntai): We should deprecate this code path after we enforce
+            # all connectors to support HMA.
+            # Hybrid memory allocator should be already turned off for this
+            # code path, but let's double-check here.
+            assert len(self.kv_cache_config.kv_cache_groups) == 1
+            return self.connector.request_finished(request, block_ids[0])
+
+        return self.connector.request_finished_all_groups(request, block_ids)
+
+    def _update_waiting_for_remote_kv(self, request: Request) -> bool:
+        """
+        KV Connector: check if the request_id is finished_recving.
+
+        The finished_recving_kv_req_ids list is populated
+        on the previous step's update_from_output based
+        on the worker-side connector.
+
+        When the KV transfer is ready, we cache the blocks
+        and the request state will be moved back to WAITING from
+        WAITING_FOR_REMOTE_KV.
+        """
+        assert self.connector is not None
+        if request.request_id not in self.finished_recving_kv_req_ids:
+            return False
+
+        if request.request_id in self.failed_recving_kv_req_ids:
+            # Request had KV load failures; num_computed_tokens was already
+            # updated in _update_requests_with_invalid_blocks.
+            if request.num_computed_tokens:
+                # Cache any valid computed tokens.
+                self.kv_cache_manager.cache_blocks(request, request.num_computed_tokens)
+            else:
+                # No valid computed tokens, release allocated blocks.
+                # There may be a local cache hit on retry.
+                self.kv_cache_manager.free(request)
+
+            self.failed_recving_kv_req_ids.remove(request.request_id)
+        else:
+            # Now that the blocks are ready, actually cache them.
+            (block_ids,) = self.kv_cache_manager.get_block_ids(request.request_id)
+            num_computed_tokens = len(block_ids) * self.block_size
+            # Handle the case where the number of request tokens is less than
+            # one block.
+            num_computed_tokens = min(num_computed_tokens, request.num_tokens)
+            if num_computed_tokens == request.num_tokens:
+                num_computed_tokens -= 1
+            # This will cache the blocks iff caching is enabled.
+            self.kv_cache_manager.cache_blocks(request, num_computed_tokens)
+
+            # Update the request state for scheduling.
+            request.num_computed_tokens = num_computed_tokens
+
+        # Return that we are ready.
+        self.finished_recving_kv_req_ids.remove(request.request_id)
+        return True
+
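The computed-token count is derived from whole received blocks, capped at the request length, and then reduced by one when the full sequence is covered, so that at least one token remains to be computed. A worked example with hypothetical sizes:

# Sketch of the computed-token math above, with hypothetical sizes.
block_size = 16
num_request_tokens = 30
block_ids = [4, 9]                                    # two received blocks

num_computed = len(block_ids) * block_size            # 32 slots received
num_computed = min(num_computed, num_request_tokens)  # cap at 30
if num_computed == num_request_tokens:
    # Leave the last token uncomputed so the model still runs a forward
    # pass for this request and can produce its next output token.
    num_computed -= 1
assert num_computed == 29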
+    def _update_from_kv_xfer_finished(self, kv_connector_output: KVConnectorOutput):
+        """
+        KV Connector: update the scheduler state based on the output.
+
+        The worker-side connectors add finished_recving and
+        finished_sending reqs to the output.
+        * if finished_sending: free the blocks
+        * if finished_recving: add to state so we can
+          schedule the request during the next step.
+        """
+
+        if self.connector is not None:
+            self.connector.update_connector_output(kv_connector_output)
+
+        # KV Connector: update recv and send status from the last step.
+        for req_id in kv_connector_output.finished_recving or ():
+            logger.debug("Finished recving KV transfer for request %s", req_id)
+            self.finished_recving_kv_req_ids.add(req_id)
+        for req_id in kv_connector_output.finished_sending or ():
+            logger.debug("Finished sending KV transfer for request %s", req_id)
+            assert req_id in self.requests
+            self._free_blocks(self.requests[req_id])
+
+    def _update_requests_with_invalid_blocks(
+        self,
+        requests: Iterable[Request],
+        invalid_block_ids: set[int],
+        evict_blocks: bool = True,
+    ) -> tuple[set[str], int, set[int]]:
+        """
+        Identify and update requests affected by invalid KV cache blocks.
+
+        This method scans the given requests, detects those with invalid blocks
+        and adjusts their `num_computed_tokens` to the longest valid prefix.
+        For observability, it also accumulates the total number of tokens that
+        will need to be recomputed across all affected requests.
+
+        Args:
+            requests: The set of requests to scan for invalid blocks.
+            invalid_block_ids: IDs of invalid blocks.
+            evict_blocks: Whether to collect blocks for eviction (False for
+                async requests which aren't cached yet).
+
+        Returns:
+            tuple:
+                - affected_req_ids (set[str]): IDs of requests impacted by
+                  invalid blocks.
+                - total_affected_tokens (int): Total number of tokens that must
+                  be recomputed across all affected requests.
+                - blocks_to_evict (set[int]): Block IDs to evict from cache,
+                  including invalid blocks and downstream dependent blocks.
+        """
+        affected_req_ids: set[str] = set()
+        total_affected_tokens = 0
+        blocks_to_evict: set[int] = set()
+        # If a block is invalid and shared by multiple requests in the batch,
+        # these requests must be rescheduled, but only the first will recompute
+        # it. This set tracks blocks already marked for recomputation.
+        marked_invalid_block_ids: set[int] = set()
+        for request in requests:
+            is_affected = False
+            marked_invalid_block = False
+            req_id = request.request_id
+            # TODO (davidb): add support for hybrid memory allocator
+            (req_block_ids,) = self.kv_cache_manager.get_block_ids(req_id)
+            # We iterate only over blocks that may contain externally computed
+            # tokens.
+            if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                # Async loading. If num_computed_tokens is set, it implies we
+                # already processed some block failures for it in a prior step.
+                req_num_computed_tokens = (
+                    request.num_computed_tokens
+                    if req_id in self.failed_recving_kv_req_ids
+                    else len(req_block_ids) * self.block_size
+                )
+            else:
+                # Sync loading. num_computed_tokens includes new tokens.
+                req_num_computed_tokens = request.num_cached_tokens
+
+            req_num_computed_blocks = (
+                req_num_computed_tokens + self.block_size - 1
+            ) // self.block_size
+            for idx, block_id in zip(range(req_num_computed_blocks), req_block_ids):
+                if block_id not in invalid_block_ids:
+                    continue
+
+                is_affected = True
+
+                if block_id in marked_invalid_block_ids:
+                    # This invalid block is shared with a previous request
+                    # and was already marked for recomputation.
+                    # This means this request can still consider this block
+                    # as computed when rescheduled.
+                    # Currently this only applies to sync loading; async
+                    # loading does not yet support block sharing.
+                    continue
+
+                marked_invalid_block_ids.add(block_id)
+
+                if marked_invalid_block:
+                    # This request has already marked an invalid block for
+                    # recomputation and updated its num_computed_tokens.
+                    continue
+
+                marked_invalid_block = True
+                # Truncate the computed tokens at the first failed block.
+                request.num_computed_tokens = idx * self.block_size
+                num_affected_tokens = (
+                    req_num_computed_tokens - request.num_computed_tokens
+                )
+                total_affected_tokens += num_affected_tokens
+                request.num_external_computed_tokens -= num_affected_tokens
+                # Collect the invalid block and all downstream dependent blocks.
+                if evict_blocks:
+                    blocks_to_evict.update(req_block_ids[idx:])
+
+            if is_affected:
+                if not marked_invalid_block:
+                    # All invalid blocks of this request are shared with
+                    # previous requests and will be recomputed by them.
+                    # Revert to considering only cached tokens as computed.
+                    # Currently this only applies to sync loading; async
+                    # loading does not yet support block sharing.
+                    total_affected_tokens += (
+                        request.num_computed_tokens - request.num_cached_tokens
+                    )
+                    request.num_computed_tokens = request.num_cached_tokens
+
+                affected_req_ids.add(request.request_id)
+
+        return affected_req_ids, total_affected_tokens, blocks_to_evict
+
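Truncation snaps num_computed_tokens down to the block boundary just before the first invalid block; everything from that block onward must be recomputed. A worked example with hypothetical block IDs and block_size 16:

# Sketch of the truncate-at-first-invalid-block logic, made-up values.
block_size = 16
req_block_ids = [3, 8, 5, 11]    # blocks backing the request
invalid_block_ids = {5}          # block 5 failed to load
req_num_computed_tokens = 60     # tokens previously assumed computed

# Ceiling division: how many of the request's blocks hold computed tokens.
num_computed_blocks = (req_num_computed_tokens + block_size - 1) // block_size
for idx, block_id in zip(range(num_computed_blocks), req_block_ids):
    if block_id in invalid_block_ids:
        # Truncate at the first failed block: keep only the valid prefix.
        num_computed_tokens = idx * block_size
        break
assert num_computed_tokens == 32  # blocks 3 and 8 remain valid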
+    def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]:
+        """
+        Handle requests affected by invalid KV cache blocks.
+
+        Returns:
+            Set of affected request IDs to skip in the update_from_output
+            main loop.
+        """
+        should_fail = not self.recompute_kv_load_failures
+
+        # Handle async KV loads (not cached yet, evict_blocks=False).
+        async_load_reqs = (
+            req
+            for req in self.waiting
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
+        )
+        async_failed_req_ids, num_failed_tokens, _ = (
+            self._update_requests_with_invalid_blocks(
+                async_load_reqs, invalid_block_ids, evict_blocks=False
+            )
+        )
+
+        total_failed_requests = len(async_failed_req_ids)
+        total_failed_tokens = num_failed_tokens
+
+        # Handle sync loads (may be cached, collect blocks for eviction).
+        sync_failed_req_ids, num_failed_tokens, sync_blocks_to_evict = (
+            self._update_requests_with_invalid_blocks(
+                self.running, invalid_block_ids, evict_blocks=True
+            )
+        )
+
+        total_failed_requests += len(sync_failed_req_ids)
+        total_failed_tokens += num_failed_tokens
+
+        if not total_failed_requests:
+            return set()
+
+        # Evict invalid blocks and downstream dependent blocks from the cache
+        # only when not using the recompute policy (where blocks will be
+        # recomputed and reused by other requests sharing them).
+        if sync_blocks_to_evict and not self.recompute_kv_load_failures:
+            self.kv_cache_manager.evict_blocks(sync_blocks_to_evict)
+
+        if should_fail:
+            all_failed_req_ids = async_failed_req_ids | sync_failed_req_ids
+            logger.error(
+                "Failing %d request(s) due to KV load failure "
+                "(failure_policy=fail, %d tokens affected). Request IDs: %s",
+                total_failed_requests,
+                total_failed_tokens,
+                all_failed_req_ids,
+            )
+            return all_failed_req_ids
+
+        logger.warning(
+            "Recovered from KV load failure: "
+            "%d request(s) rescheduled (%d tokens affected).",
+            total_failed_requests,
+            total_failed_tokens,
+        )
+
+        # Mark async requests with KV load failures for retry once loading
+        # completes.
+        self.failed_recving_kv_req_ids |= async_failed_req_ids
+        # Return sync affected IDs to skip in update_from_output.
+        return sync_failed_req_ids