vllm-cpu 0.9.2.post2__cp311-cp311-manylinux_2_17_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1236) hide show
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +214 -0
  3. vllm/_custom_ops.py +1915 -0
  4. vllm/_ipex_ops.py +350 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +139 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +325 -0
  20. vllm/attention/backends/blocksparse_attn.py +465 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1506 -0
  23. vllm/attention/backends/flash_attn.py +1008 -0
  24. vllm/attention/backends/flashinfer.py +1107 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +318 -0
  27. vllm/attention/backends/ipex_attn.py +403 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1391 -0
  30. vllm/attention/backends/pallas.py +356 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +1015 -0
  34. vllm/attention/backends/torch_sdpa.py +707 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +807 -0
  38. vllm/attention/layer.py +481 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +903 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/pallas_kv_cache_update.py +120 -0
  52. vllm/attention/ops/prefix_prefill.py +902 -0
  53. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  54. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  55. vllm/attention/ops/triton_decode_attention.py +674 -0
  56. vllm/attention/ops/triton_flash_attention.py +984 -0
  57. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  58. vllm/attention/ops/triton_unified_attention.py +738 -0
  59. vllm/attention/selector.py +214 -0
  60. vllm/attention/utils/fa_utils.py +72 -0
  61. vllm/beam_search.py +87 -0
  62. vllm/benchmarks/__init__.py +0 -0
  63. vllm/benchmarks/datasets.py +1441 -0
  64. vllm/benchmarks/endpoint_request_func.py +393 -0
  65. vllm/benchmarks/latency.py +168 -0
  66. vllm/benchmarks/serve.py +1063 -0
  67. vllm/benchmarks/throughput.py +609 -0
  68. vllm/benchmarks/utils.py +70 -0
  69. vllm/collect_env.py +820 -0
  70. vllm/compilation/__init__.py +0 -0
  71. vllm/compilation/activation_quant_fusion.py +89 -0
  72. vllm/compilation/backends.py +610 -0
  73. vllm/compilation/base_piecewise_backend.py +72 -0
  74. vllm/compilation/collective_fusion.py +127 -0
  75. vllm/compilation/compiler_interface.py +564 -0
  76. vllm/compilation/counter.py +41 -0
  77. vllm/compilation/cuda_piecewise_backend.py +218 -0
  78. vllm/compilation/decorators.py +250 -0
  79. vllm/compilation/fix_functionalization.py +191 -0
  80. vllm/compilation/fusion.py +645 -0
  81. vllm/compilation/fusion_attn.py +166 -0
  82. vllm/compilation/fx_utils.py +84 -0
  83. vllm/compilation/inductor_pass.py +115 -0
  84. vllm/compilation/monitor.py +39 -0
  85. vllm/compilation/multi_output_match.py +109 -0
  86. vllm/compilation/noop_elimination.py +165 -0
  87. vllm/compilation/pass_manager.py +82 -0
  88. vllm/compilation/sequence_parallelism.py +482 -0
  89. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  90. vllm/compilation/vllm_inductor_pass.py +70 -0
  91. vllm/compilation/wrapper.py +135 -0
  92. vllm/config.py +4913 -0
  93. vllm/connections.py +174 -0
  94. vllm/core/__init__.py +0 -0
  95. vllm/core/block/__init__.py +0 -0
  96. vllm/core/block/block_table.py +399 -0
  97. vllm/core/block/common.py +371 -0
  98. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  99. vllm/core/block/interfaces.py +319 -0
  100. vllm/core/block/naive_block.py +466 -0
  101. vllm/core/block/prefix_caching_block.py +1135 -0
  102. vllm/core/block/utils.py +28 -0
  103. vllm/core/block_manager.py +525 -0
  104. vllm/core/evictor.py +157 -0
  105. vllm/core/interfaces.py +139 -0
  106. vllm/core/placeholder_block_space_manager.py +103 -0
  107. vllm/core/scheduler.py +2126 -0
  108. vllm/device_allocator/__init__.py +0 -0
  109. vllm/device_allocator/cumem.py +281 -0
  110. vllm/distributed/__init__.py +6 -0
  111. vllm/distributed/communication_op.py +41 -0
  112. vllm/distributed/device_communicators/__init__.py +0 -0
  113. vllm/distributed/device_communicators/all2all.py +264 -0
  114. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  115. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  116. vllm/distributed/device_communicators/cuda_communicator.py +194 -0
  117. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  118. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  119. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  120. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  121. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  122. vllm/distributed/device_communicators/pynccl.py +218 -0
  123. vllm/distributed/device_communicators/pynccl_wrapper.py +349 -0
  124. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  125. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  126. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  127. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  128. vllm/distributed/eplb/__init__.py +8 -0
  129. vllm/distributed/eplb/eplb_state.py +432 -0
  130. vllm/distributed/eplb/rebalance_algo.py +234 -0
  131. vllm/distributed/eplb/rebalance_execute.py +307 -0
  132. vllm/distributed/kv_events.py +356 -0
  133. vllm/distributed/kv_transfer/README.md +29 -0
  134. vllm/distributed/kv_transfer/__init__.py +12 -0
  135. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  137. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  138. vllm/distributed/kv_transfer/kv_connector/factory.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  140. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  141. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  142. vllm/distributed/kv_transfer/kv_connector/utils.py +109 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1103 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +485 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +533 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +265 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +389 -0
  153. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  154. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  155. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  156. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  158. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  159. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  160. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  161. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  162. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  163. vllm/distributed/parallel_state.py +1385 -0
  164. vllm/distributed/tpu_distributed_utils.py +178 -0
  165. vllm/distributed/utils.py +536 -0
  166. vllm/engine/__init__.py +0 -0
  167. vllm/engine/arg_utils.py +1801 -0
  168. vllm/engine/async_llm_engine.py +1200 -0
  169. vllm/engine/async_timeout.py +173 -0
  170. vllm/engine/llm_engine.py +2101 -0
  171. vllm/engine/metrics.py +629 -0
  172. vllm/engine/metrics_types.py +94 -0
  173. vllm/engine/multiprocessing/__init__.py +148 -0
  174. vllm/engine/multiprocessing/client.py +681 -0
  175. vllm/engine/multiprocessing/engine.py +460 -0
  176. vllm/engine/output_processor/__init__.py +0 -0
  177. vllm/engine/output_processor/interfaces.py +75 -0
  178. vllm/engine/output_processor/multi_step.py +216 -0
  179. vllm/engine/output_processor/single_step.py +145 -0
  180. vllm/engine/output_processor/stop_checker.py +131 -0
  181. vllm/engine/output_processor/util.py +28 -0
  182. vllm/engine/protocol.py +326 -0
  183. vllm/entrypoints/__init__.py +0 -0
  184. vllm/entrypoints/api_server.py +178 -0
  185. vllm/entrypoints/chat_utils.py +1278 -0
  186. vllm/entrypoints/cli/__init__.py +12 -0
  187. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  188. vllm/entrypoints/cli/benchmark/base.py +25 -0
  189. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  190. vllm/entrypoints/cli/benchmark/main.py +58 -0
  191. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  192. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  193. vllm/entrypoints/cli/collect_env.py +36 -0
  194. vllm/entrypoints/cli/main.py +71 -0
  195. vllm/entrypoints/cli/openai.py +201 -0
  196. vllm/entrypoints/cli/run_batch.py +69 -0
  197. vllm/entrypoints/cli/serve.py +265 -0
  198. vllm/entrypoints/cli/types.py +29 -0
  199. vllm/entrypoints/launcher.py +147 -0
  200. vllm/entrypoints/llm.py +1599 -0
  201. vllm/entrypoints/logger.py +50 -0
  202. vllm/entrypoints/openai/__init__.py +0 -0
  203. vllm/entrypoints/openai/api_server.py +1495 -0
  204. vllm/entrypoints/openai/cli_args.py +331 -0
  205. vllm/entrypoints/openai/logits_processors.py +90 -0
  206. vllm/entrypoints/openai/protocol.py +2096 -0
  207. vllm/entrypoints/openai/run_batch.py +473 -0
  208. vllm/entrypoints/openai/serving_chat.py +1258 -0
  209. vllm/entrypoints/openai/serving_classification.py +160 -0
  210. vllm/entrypoints/openai/serving_completion.py +618 -0
  211. vllm/entrypoints/openai/serving_embedding.py +201 -0
  212. vllm/entrypoints/openai/serving_engine.py +988 -0
  213. vllm/entrypoints/openai/serving_models.py +315 -0
  214. vllm/entrypoints/openai/serving_pooling.py +234 -0
  215. vllm/entrypoints/openai/serving_score.py +431 -0
  216. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  217. vllm/entrypoints/openai/serving_transcription.py +132 -0
  218. vllm/entrypoints/openai/speech_to_text.py +395 -0
  219. vllm/entrypoints/openai/tool_parsers/__init__.py +25 -0
  220. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  221. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  222. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  223. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  224. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  225. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  226. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  227. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  228. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  229. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +369 -0
  230. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  231. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  232. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  233. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  234. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +466 -0
  235. vllm/entrypoints/score_utils.py +50 -0
  236. vllm/entrypoints/ssl.py +75 -0
  237. vllm/entrypoints/utils.py +262 -0
  238. vllm/env_override.py +41 -0
  239. vllm/envs.py +1029 -0
  240. vllm/executor/__init__.py +0 -0
  241. vllm/executor/executor_base.py +401 -0
  242. vllm/executor/mp_distributed_executor.py +244 -0
  243. vllm/executor/msgspec_utils.py +30 -0
  244. vllm/executor/multiproc_worker_utils.py +313 -0
  245. vllm/executor/ray_distributed_executor.py +701 -0
  246. vllm/executor/ray_utils.py +399 -0
  247. vllm/executor/uniproc_executor.py +139 -0
  248. vllm/forward_context.py +185 -0
  249. vllm/inputs/__init__.py +41 -0
  250. vllm/inputs/data.py +331 -0
  251. vllm/inputs/parse.py +151 -0
  252. vllm/inputs/preprocess.py +924 -0
  253. vllm/inputs/registry.py +245 -0
  254. vllm/jsontree.py +80 -0
  255. vllm/logger.py +212 -0
  256. vllm/logging_utils/__init__.py +8 -0
  257. vllm/logging_utils/dump_input.py +81 -0
  258. vllm/logging_utils/formatter.py +18 -0
  259. vllm/logits_process.py +119 -0
  260. vllm/lora/__init__.py +0 -0
  261. vllm/lora/fully_sharded_layers.py +355 -0
  262. vllm/lora/layers.py +1285 -0
  263. vllm/lora/lora.py +199 -0
  264. vllm/lora/models.py +818 -0
  265. vllm/lora/ops/__init__.py +0 -0
  266. vllm/lora/ops/torch_ops/__init__.py +16 -0
  267. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  268. vllm/lora/ops/triton_ops/__init__.py +12 -0
  269. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  270. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  271. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  272. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  273. vllm/lora/ops/triton_ops/utils.py +120 -0
  274. vllm/lora/ops/xla_ops/__init__.py +7 -0
  275. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  276. vllm/lora/peft_helper.py +136 -0
  277. vllm/lora/punica_wrapper/__init__.py +10 -0
  278. vllm/lora/punica_wrapper/punica_base.py +485 -0
  279. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  280. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  281. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  284. vllm/lora/punica_wrapper/utils.py +164 -0
  285. vllm/lora/request.py +99 -0
  286. vllm/lora/resolver.py +85 -0
  287. vllm/lora/utils.py +240 -0
  288. vllm/lora/worker_manager.py +256 -0
  289. vllm/model_executor/__init__.py +16 -0
  290. vllm/model_executor/custom_op.py +208 -0
  291. vllm/model_executor/guided_decoding/__init__.py +181 -0
  292. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  293. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  294. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  295. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  296. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  297. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  298. vllm/model_executor/guided_decoding/utils.py +242 -0
  299. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  300. vllm/model_executor/layers/__init__.py +0 -0
  301. vllm/model_executor/layers/activation.py +420 -0
  302. vllm/model_executor/layers/fused_moe/__init__.py +78 -0
  303. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +298 -0
  304. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +140 -0
  305. vllm/model_executor/layers/fused_moe/config.py +456 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  475. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +215 -0
  476. vllm/model_executor/layers/fused_moe/cutlass_moe.py +645 -0
  477. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +250 -0
  478. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +231 -0
  479. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +183 -0
  480. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1021 -0
  481. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +234 -0
  482. vllm/model_executor/layers/fused_moe/fused_moe.py +1734 -0
  483. vllm/model_executor/layers/fused_moe/layer.py +1528 -0
  484. vllm/model_executor/layers/fused_moe/modular_kernel.py +598 -0
  485. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +224 -0
  486. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  487. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  488. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  489. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +233 -0
  490. vllm/model_executor/layers/fused_moe/prepare_finalize.py +66 -0
  491. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +429 -0
  492. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +136 -0
  493. vllm/model_executor/layers/fused_moe/utils.py +144 -0
  494. vllm/model_executor/layers/layernorm.py +287 -0
  495. vllm/model_executor/layers/lightning_attn.py +652 -0
  496. vllm/model_executor/layers/linear.py +1547 -0
  497. vllm/model_executor/layers/logits_processor.py +197 -0
  498. vllm/model_executor/layers/mamba/__init__.py +0 -0
  499. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  500. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  501. vllm/model_executor/layers/mamba/mamba_mixer2.py +731 -0
  502. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  503. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  504. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  505. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  506. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  507. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  508. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  509. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  510. vllm/model_executor/layers/pooler.py +473 -0
  511. vllm/model_executor/layers/quantization/__init__.py +160 -0
  512. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  513. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  514. vllm/model_executor/layers/quantization/awq.py +228 -0
  515. vllm/model_executor/layers/quantization/awq_marlin.py +523 -0
  516. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  517. vllm/model_executor/layers/quantization/base_config.py +164 -0
  518. vllm/model_executor/layers/quantization/bitblas.py +462 -0
  519. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  520. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  521. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +694 -0
  522. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1613 -0
  523. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  524. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  525. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  526. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  527. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  528. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +149 -0
  529. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  530. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  531. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  532. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  533. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  534. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  535. vllm/model_executor/layers/quantization/deepgemm.py +83 -0
  536. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  537. vllm/model_executor/layers/quantization/experts_int8.py +204 -0
  538. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  539. vllm/model_executor/layers/quantization/fp8.py +950 -0
  540. vllm/model_executor/layers/quantization/gguf.py +577 -0
  541. vllm/model_executor/layers/quantization/gptq.py +278 -0
  542. vllm/model_executor/layers/quantization/gptq_bitblas.py +446 -0
  543. vllm/model_executor/layers/quantization/gptq_marlin.py +679 -0
  544. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  545. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  546. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  547. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  548. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  549. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  550. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  551. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  552. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  553. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +132 -0
  554. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  555. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  556. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  557. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  558. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  559. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  560. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  561. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  562. vllm/model_executor/layers/quantization/marlin.py +263 -0
  563. vllm/model_executor/layers/quantization/modelopt.py +747 -0
  564. vllm/model_executor/layers/quantization/moe_wna16.py +457 -0
  565. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  566. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  567. vllm/model_executor/layers/quantization/qqq.py +275 -0
  568. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  569. vllm/model_executor/layers/quantization/quark/quark.py +437 -0
  570. vllm/model_executor/layers/quantization/quark/quark_moe.py +245 -0
  571. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  572. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  573. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  574. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +157 -0
  575. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  576. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  577. vllm/model_executor/layers/quantization/rtn.py +289 -0
  578. vllm/model_executor/layers/quantization/schema.py +86 -0
  579. vllm/model_executor/layers/quantization/torchao.py +212 -0
  580. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  581. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  582. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  583. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/fp8_utils.py +653 -0
  787. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  788. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  789. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  790. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  791. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  792. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  793. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  794. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  795. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  796. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  797. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  798. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +146 -0
  799. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  800. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  801. vllm/model_executor/layers/rejection_sampler.py +406 -0
  802. vllm/model_executor/layers/resampler.py +270 -0
  803. vllm/model_executor/layers/rotary_embedding.py +2025 -0
  804. vllm/model_executor/layers/sampler.py +1204 -0
  805. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  806. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  807. vllm/model_executor/layers/utils.py +116 -0
  808. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  809. vllm/model_executor/model_loader/__init__.py +77 -0
  810. vllm/model_executor/model_loader/base_loader.py +43 -0
  811. vllm/model_executor/model_loader/bitsandbytes_loader.py +613 -0
  812. vllm/model_executor/model_loader/default_loader.py +282 -0
  813. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  814. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  815. vllm/model_executor/model_loader/neuron.py +476 -0
  816. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  817. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  818. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  819. vllm/model_executor/model_loader/tensorizer.py +602 -0
  820. vllm/model_executor/model_loader/tensorizer_loader.py +127 -0
  821. vllm/model_executor/model_loader/tpu.py +113 -0
  822. vllm/model_executor/model_loader/utils.py +315 -0
  823. vllm/model_executor/model_loader/weight_utils.py +782 -0
  824. vllm/model_executor/models/__init__.py +30 -0
  825. vllm/model_executor/models/adapters.py +375 -0
  826. vllm/model_executor/models/aimv2.py +246 -0
  827. vllm/model_executor/models/arctic.py +559 -0
  828. vllm/model_executor/models/aria.py +670 -0
  829. vllm/model_executor/models/aya_vision.py +486 -0
  830. vllm/model_executor/models/baichuan.py +474 -0
  831. vllm/model_executor/models/bamba.py +558 -0
  832. vllm/model_executor/models/bart.py +938 -0
  833. vllm/model_executor/models/bert.py +513 -0
  834. vllm/model_executor/models/bert_with_rope.py +617 -0
  835. vllm/model_executor/models/blip.py +339 -0
  836. vllm/model_executor/models/blip2.py +728 -0
  837. vllm/model_executor/models/bloom.py +373 -0
  838. vllm/model_executor/models/chameleon.py +1146 -0
  839. vllm/model_executor/models/chatglm.py +478 -0
  840. vllm/model_executor/models/clip.py +407 -0
  841. vllm/model_executor/models/commandr.py +471 -0
  842. vllm/model_executor/models/config.py +200 -0
  843. vllm/model_executor/models/constant_size_cache.py +137 -0
  844. vllm/model_executor/models/dbrx.py +472 -0
  845. vllm/model_executor/models/deepseek.py +486 -0
  846. vllm/model_executor/models/deepseek_mtp.py +281 -0
  847. vllm/model_executor/models/deepseek_v2.py +935 -0
  848. vllm/model_executor/models/deepseek_vl2.py +660 -0
  849. vllm/model_executor/models/dots1.py +536 -0
  850. vllm/model_executor/models/eagle.py +261 -0
  851. vllm/model_executor/models/ernie45.py +43 -0
  852. vllm/model_executor/models/ernie45_moe.py +583 -0
  853. vllm/model_executor/models/exaone.py +551 -0
  854. vllm/model_executor/models/fairseq2_llama.py +154 -0
  855. vllm/model_executor/models/falcon.py +510 -0
  856. vllm/model_executor/models/falcon_h1.py +708 -0
  857. vllm/model_executor/models/florence2.py +1113 -0
  858. vllm/model_executor/models/fuyu.py +406 -0
  859. vllm/model_executor/models/gemma.py +427 -0
  860. vllm/model_executor/models/gemma2.py +427 -0
  861. vllm/model_executor/models/gemma3.py +535 -0
  862. vllm/model_executor/models/gemma3_mm.py +729 -0
  863. vllm/model_executor/models/gemma3n.py +811 -0
  864. vllm/model_executor/models/glm.py +23 -0
  865. vllm/model_executor/models/glm4.py +305 -0
  866. vllm/model_executor/models/glm4_1v.py +1590 -0
  867. vllm/model_executor/models/glm4v.py +657 -0
  868. vllm/model_executor/models/gpt2.py +382 -0
  869. vllm/model_executor/models/gpt_bigcode.py +335 -0
  870. vllm/model_executor/models/gpt_j.py +339 -0
  871. vllm/model_executor/models/gpt_neox.py +332 -0
  872. vllm/model_executor/models/granite.py +493 -0
  873. vllm/model_executor/models/granite_speech.py +790 -0
  874. vllm/model_executor/models/granitemoe.py +437 -0
  875. vllm/model_executor/models/granitemoehybrid.py +653 -0
  876. vllm/model_executor/models/granitemoeshared.py +341 -0
  877. vllm/model_executor/models/gritlm.py +224 -0
  878. vllm/model_executor/models/grok1.py +546 -0
  879. vllm/model_executor/models/h2ovl.py +549 -0
  880. vllm/model_executor/models/hunyuan_v1_moe.py +897 -0
  881. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  882. vllm/model_executor/models/idefics3.py +786 -0
  883. vllm/model_executor/models/interfaces.py +681 -0
  884. vllm/model_executor/models/interfaces_base.py +164 -0
  885. vllm/model_executor/models/intern_vit.py +480 -0
  886. vllm/model_executor/models/internlm2.py +455 -0
  887. vllm/model_executor/models/internlm2_ve.py +147 -0
  888. vllm/model_executor/models/internvl.py +1432 -0
  889. vllm/model_executor/models/jais.py +373 -0
  890. vllm/model_executor/models/jamba.py +592 -0
  891. vllm/model_executor/models/keye.py +1736 -0
  892. vllm/model_executor/models/kimi_vl.py +585 -0
  893. vllm/model_executor/models/llama.py +644 -0
  894. vllm/model_executor/models/llama4.py +531 -0
  895. vllm/model_executor/models/llama_eagle.py +165 -0
  896. vllm/model_executor/models/llama_eagle3.py +263 -0
  897. vllm/model_executor/models/llava.py +887 -0
  898. vllm/model_executor/models/llava_next.py +604 -0
  899. vllm/model_executor/models/llava_next_video.py +492 -0
  900. vllm/model_executor/models/llava_onevision.py +985 -0
  901. vllm/model_executor/models/mamba.py +273 -0
  902. vllm/model_executor/models/mamba2.py +320 -0
  903. vllm/model_executor/models/mamba_cache.py +76 -0
  904. vllm/model_executor/models/medusa.py +219 -0
  905. vllm/model_executor/models/mimo.py +192 -0
  906. vllm/model_executor/models/mimo_mtp.py +285 -0
  907. vllm/model_executor/models/minicpm.py +592 -0
  908. vllm/model_executor/models/minicpm3.py +230 -0
  909. vllm/model_executor/models/minicpm_eagle.py +391 -0
  910. vllm/model_executor/models/minicpmo.py +772 -0
  911. vllm/model_executor/models/minicpmv.py +1307 -0
  912. vllm/model_executor/models/minimax_cache.py +36 -0
  913. vllm/model_executor/models/minimax_text_01.py +1301 -0
  914. vllm/model_executor/models/minimax_vl_01.py +374 -0
  915. vllm/model_executor/models/mistral3.py +624 -0
  916. vllm/model_executor/models/mixtral.py +488 -0
  917. vllm/model_executor/models/mixtral_quant.py +453 -0
  918. vllm/model_executor/models/mllama.py +1682 -0
  919. vllm/model_executor/models/mllama4.py +947 -0
  920. vllm/model_executor/models/mlp_speculator.py +206 -0
  921. vllm/model_executor/models/modernbert.py +339 -0
  922. vllm/model_executor/models/module_mapping.py +72 -0
  923. vllm/model_executor/models/molmo.py +1576 -0
  924. vllm/model_executor/models/moonvit.py +630 -0
  925. vllm/model_executor/models/mpt.py +331 -0
  926. vllm/model_executor/models/nemotron.py +508 -0
  927. vllm/model_executor/models/nemotron_h.py +588 -0
  928. vllm/model_executor/models/nemotron_nas.py +484 -0
  929. vllm/model_executor/models/nvlm_d.py +216 -0
  930. vllm/model_executor/models/olmo.py +389 -0
  931. vllm/model_executor/models/olmo2.py +414 -0
  932. vllm/model_executor/models/olmoe.py +468 -0
  933. vllm/model_executor/models/opt.py +412 -0
  934. vllm/model_executor/models/orion.py +349 -0
  935. vllm/model_executor/models/ovis.py +577 -0
  936. vllm/model_executor/models/paligemma.py +419 -0
  937. vllm/model_executor/models/persimmon.py +344 -0
  938. vllm/model_executor/models/phi.py +356 -0
  939. vllm/model_executor/models/phi3.py +19 -0
  940. vllm/model_executor/models/phi3_small.py +465 -0
  941. vllm/model_executor/models/phi3v.py +733 -0
  942. vllm/model_executor/models/phi4mm.py +1258 -0
  943. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  944. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  945. vllm/model_executor/models/phimoe.py +674 -0
  946. vllm/model_executor/models/pixtral.py +1329 -0
  947. vllm/model_executor/models/plamo2.py +738 -0
  948. vllm/model_executor/models/prithvi_geospatial_mae.py +240 -0
  949. vllm/model_executor/models/qwen.py +362 -0
  950. vllm/model_executor/models/qwen2.py +501 -0
  951. vllm/model_executor/models/qwen2_5_omni_thinker.py +923 -0
  952. vllm/model_executor/models/qwen2_5_vl.py +1175 -0
  953. vllm/model_executor/models/qwen2_audio.py +420 -0
  954. vllm/model_executor/models/qwen2_moe.py +540 -0
  955. vllm/model_executor/models/qwen2_rm.py +122 -0
  956. vllm/model_executor/models/qwen2_vl.py +1513 -0
  957. vllm/model_executor/models/qwen3.py +325 -0
  958. vllm/model_executor/models/qwen3_moe.py +541 -0
  959. vllm/model_executor/models/qwen_vl.py +796 -0
  960. vllm/model_executor/models/registry.py +634 -0
  961. vllm/model_executor/models/roberta.py +271 -0
  962. vllm/model_executor/models/siglip.py +524 -0
  963. vllm/model_executor/models/skyworkr1v.py +961 -0
  964. vllm/model_executor/models/smolvlm.py +52 -0
  965. vllm/model_executor/models/solar.py +506 -0
  966. vllm/model_executor/models/stablelm.py +343 -0
  967. vllm/model_executor/models/starcoder2.py +356 -0
  968. vllm/model_executor/models/tarsier.py +652 -0
  969. vllm/model_executor/models/telechat2.py +140 -0
  970. vllm/model_executor/models/teleflm.py +79 -0
  971. vllm/model_executor/models/transformers.py +509 -0
  972. vllm/model_executor/models/ultravox.py +670 -0
  973. vllm/model_executor/models/utils.py +744 -0
  974. vllm/model_executor/models/vision.py +147 -0
  975. vllm/model_executor/models/whisper.py +886 -0
  976. vllm/model_executor/models/zamba2.py +1036 -0
  977. vllm/model_executor/parameter.py +459 -0
  978. vllm/model_executor/pooling_metadata.py +72 -0
  979. vllm/model_executor/sampling_metadata.py +597 -0
  980. vllm/model_executor/utils.py +80 -0
  981. vllm/multimodal/__init__.py +33 -0
  982. vllm/multimodal/audio.py +116 -0
  983. vllm/multimodal/base.py +219 -0
  984. vllm/multimodal/hasher.py +91 -0
  985. vllm/multimodal/image.py +103 -0
  986. vllm/multimodal/inputs.py +878 -0
  987. vllm/multimodal/parse.py +499 -0
  988. vllm/multimodal/processing.py +1948 -0
  989. vllm/multimodal/profiling.py +283 -0
  990. vllm/multimodal/registry.py +331 -0
  991. vllm/multimodal/utils.py +492 -0
  992. vllm/multimodal/video.py +227 -0
  993. vllm/outputs.py +516 -0
  994. vllm/platforms/__init__.py +291 -0
  995. vllm/platforms/cpu.py +281 -0
  996. vllm/platforms/cuda.py +568 -0
  997. vllm/platforms/hpu.py +106 -0
  998. vllm/platforms/interface.py +551 -0
  999. vllm/platforms/neuron.py +150 -0
  1000. vllm/platforms/rocm.py +453 -0
  1001. vllm/platforms/tpu.py +206 -0
  1002. vllm/platforms/xpu.py +192 -0
  1003. vllm/plugins/__init__.py +94 -0
  1004. vllm/plugins/lora_resolvers/README.md +15 -0
  1005. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1006. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1007. vllm/pooling_params.py +64 -0
  1008. vllm/profiler/__init__.py +0 -0
  1009. vllm/profiler/layerwise_profile.py +375 -0
  1010. vllm/profiler/utils.py +148 -0
  1011. vllm/prompt_adapter/__init__.py +0 -0
  1012. vllm/prompt_adapter/layers.py +83 -0
  1013. vllm/prompt_adapter/models.py +358 -0
  1014. vllm/prompt_adapter/request.py +37 -0
  1015. vllm/prompt_adapter/utils.py +98 -0
  1016. vllm/prompt_adapter/worker_manager.py +179 -0
  1017. vllm/py.typed +2 -0
  1018. vllm/reasoning/__init__.py +15 -0
  1019. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  1020. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1021. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1022. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1023. vllm/sampling_params.py +602 -0
  1024. vllm/scalar_type.py +347 -0
  1025. vllm/scripts.py +15 -0
  1026. vllm/sequence.py +1568 -0
  1027. vllm/spec_decode/__init__.py +0 -0
  1028. vllm/spec_decode/batch_expansion.py +506 -0
  1029. vllm/spec_decode/draft_model_runner.py +349 -0
  1030. vllm/spec_decode/interfaces.py +99 -0
  1031. vllm/spec_decode/medusa_worker.py +138 -0
  1032. vllm/spec_decode/metrics.py +213 -0
  1033. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1034. vllm/spec_decode/mqa_scorer.py +160 -0
  1035. vllm/spec_decode/multi_step_worker.py +423 -0
  1036. vllm/spec_decode/ngram_worker.py +196 -0
  1037. vllm/spec_decode/proposer_worker_base.py +59 -0
  1038. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1039. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1040. vllm/spec_decode/target_model_runner.py +45 -0
  1041. vllm/spec_decode/top1_proposer.py +275 -0
  1042. vllm/spec_decode/util.py +277 -0
  1043. vllm/test_utils.py +130 -0
  1044. vllm/third_party/__init__.py +0 -0
  1045. vllm/third_party/pynvml.py +6140 -0
  1046. vllm/tracing.py +131 -0
  1047. vllm/transformers_utils/__init__.py +24 -0
  1048. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1049. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1050. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1051. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1052. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1053. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1054. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1055. vllm/transformers_utils/config.py +922 -0
  1056. vllm/transformers_utils/configs/__init__.py +57 -0
  1057. vllm/transformers_utils/configs/arctic.py +207 -0
  1058. vllm/transformers_utils/configs/chatglm.py +72 -0
  1059. vllm/transformers_utils/configs/cohere2.py +195 -0
  1060. vllm/transformers_utils/configs/dbrx.py +280 -0
  1061. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1062. vllm/transformers_utils/configs/eagle.py +85 -0
  1063. vllm/transformers_utils/configs/exaone.py +190 -0
  1064. vllm/transformers_utils/configs/falcon.py +90 -0
  1065. vllm/transformers_utils/configs/jais.py +238 -0
  1066. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1067. vllm/transformers_utils/configs/medusa.py +63 -0
  1068. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1069. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1070. vllm/transformers_utils/configs/mllama.py +31 -0
  1071. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1072. vllm/transformers_utils/configs/moonvit.py +33 -0
  1073. vllm/transformers_utils/configs/mpt.py +180 -0
  1074. vllm/transformers_utils/configs/nemotron.py +205 -0
  1075. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1076. vllm/transformers_utils/configs/nvlm_d.py +31 -0
  1077. vllm/transformers_utils/configs/ovis.py +184 -0
  1078. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1079. vllm/transformers_utils/configs/solar.py +247 -0
  1080. vllm/transformers_utils/configs/telechat2.py +64 -0
  1081. vllm/transformers_utils/configs/ultravox.py +108 -0
  1082. vllm/transformers_utils/detokenizer.py +168 -0
  1083. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1084. vllm/transformers_utils/processor.py +221 -0
  1085. vllm/transformers_utils/processors/__init__.py +8 -0
  1086. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1087. vllm/transformers_utils/processors/ovis.py +420 -0
  1088. vllm/transformers_utils/s3_utils.py +162 -0
  1089. vllm/transformers_utils/tokenizer.py +302 -0
  1090. vllm/transformers_utils/tokenizer_base.py +149 -0
  1091. vllm/transformers_utils/tokenizer_group.py +120 -0
  1092. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1093. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1094. vllm/transformers_utils/utils.py +99 -0
  1095. vllm/triton_utils/__init__.py +14 -0
  1096. vllm/triton_utils/importing.py +94 -0
  1097. vllm/usage/__init__.py +0 -0
  1098. vllm/usage/usage_lib.py +259 -0
  1099. vllm/utils/__init__.py +3008 -0
  1100. vllm/v1/__init__.py +0 -0
  1101. vllm/v1/attention/__init__.py +0 -0
  1102. vllm/v1/attention/backends/__init__.py +0 -0
  1103. vllm/v1/attention/backends/cpu_attn.py +184 -0
  1104. vllm/v1/attention/backends/flash_attn.py +757 -0
  1105. vllm/v1/attention/backends/flashinfer.py +680 -0
  1106. vllm/v1/attention/backends/flex_attention.py +491 -0
  1107. vllm/v1/attention/backends/mamba_attn.py +192 -0
  1108. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1109. vllm/v1/attention/backends/mla/common.py +978 -0
  1110. vllm/v1/attention/backends/mla/cutlass_mla.py +98 -0
  1111. vllm/v1/attention/backends/mla/flashmla.py +180 -0
  1112. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +241 -0
  1113. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1114. vllm/v1/attention/backends/pallas.py +320 -0
  1115. vllm/v1/attention/backends/rocm_aiter_fa.py +609 -0
  1116. vllm/v1/attention/backends/triton_attn.py +449 -0
  1117. vllm/v1/attention/backends/utils.py +310 -0
  1118. vllm/v1/core/__init__.py +0 -0
  1119. vllm/v1/core/block_pool.py +349 -0
  1120. vllm/v1/core/encoder_cache_manager.py +254 -0
  1121. vllm/v1/core/kv_cache_coordinator.py +369 -0
  1122. vllm/v1/core/kv_cache_manager.py +398 -0
  1123. vllm/v1/core/kv_cache_utils.py +999 -0
  1124. vllm/v1/core/sched/__init__.py +0 -0
  1125. vllm/v1/core/sched/interface.py +150 -0
  1126. vllm/v1/core/sched/output.py +157 -0
  1127. vllm/v1/core/sched/request_queue.py +224 -0
  1128. vllm/v1/core/sched/scheduler.py +1115 -0
  1129. vllm/v1/core/sched/utils.py +36 -0
  1130. vllm/v1/core/single_type_kv_cache_manager.py +444 -0
  1131. vllm/v1/engine/__init__.py +179 -0
  1132. vllm/v1/engine/async_llm.py +626 -0
  1133. vllm/v1/engine/coordinator.py +278 -0
  1134. vllm/v1/engine/core.py +1046 -0
  1135. vllm/v1/engine/core_client.py +1049 -0
  1136. vllm/v1/engine/detokenizer.py +292 -0
  1137. vllm/v1/engine/exceptions.py +17 -0
  1138. vllm/v1/engine/llm_engine.py +322 -0
  1139. vllm/v1/engine/logprobs.py +200 -0
  1140. vllm/v1/engine/mm_input_cache.py +91 -0
  1141. vllm/v1/engine/output_processor.py +477 -0
  1142. vllm/v1/engine/parallel_sampling.py +133 -0
  1143. vllm/v1/engine/processor.py +422 -0
  1144. vllm/v1/engine/utils.py +546 -0
  1145. vllm/v1/executor/__init__.py +0 -0
  1146. vllm/v1/executor/abstract.py +113 -0
  1147. vllm/v1/executor/multiproc_executor.py +532 -0
  1148. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1149. vllm/v1/kv_cache_interface.py +223 -0
  1150. vllm/v1/metrics/__init__.py +0 -0
  1151. vllm/v1/metrics/loggers.py +557 -0
  1152. vllm/v1/metrics/prometheus.py +82 -0
  1153. vllm/v1/metrics/ray_wrappers.py +131 -0
  1154. vllm/v1/metrics/reader.py +246 -0
  1155. vllm/v1/metrics/stats.py +240 -0
  1156. vllm/v1/outputs.py +124 -0
  1157. vllm/v1/pool/__init__.py +0 -0
  1158. vllm/v1/pool/metadata.py +17 -0
  1159. vllm/v1/request.py +229 -0
  1160. vllm/v1/sample/__init__.py +0 -0
  1161. vllm/v1/sample/logits_processor.py +517 -0
  1162. vllm/v1/sample/metadata.py +43 -0
  1163. vllm/v1/sample/ops/__init__.py +0 -0
  1164. vllm/v1/sample/ops/bad_words.py +39 -0
  1165. vllm/v1/sample/ops/penalties.py +43 -0
  1166. vllm/v1/sample/ops/topk_topp_sampler.py +296 -0
  1167. vllm/v1/sample/rejection_sampler.py +631 -0
  1168. vllm/v1/sample/sampler.py +226 -0
  1169. vllm/v1/sample/tpu/__init__.py +0 -0
  1170. vllm/v1/sample/tpu/metadata.py +124 -0
  1171. vllm/v1/sample/tpu/sampler.py +145 -0
  1172. vllm/v1/serial_utils.py +315 -0
  1173. vllm/v1/spec_decode/__init__.py +0 -0
  1174. vllm/v1/spec_decode/eagle.py +441 -0
  1175. vllm/v1/spec_decode/medusa.py +64 -0
  1176. vllm/v1/spec_decode/metadata.py +62 -0
  1177. vllm/v1/spec_decode/metrics.py +178 -0
  1178. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1179. vllm/v1/spec_decode/utils.py +41 -0
  1180. vllm/v1/structured_output/__init__.py +227 -0
  1181. vllm/v1/structured_output/backend_guidance.py +245 -0
  1182. vllm/v1/structured_output/backend_types.py +134 -0
  1183. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1184. vllm/v1/structured_output/request.py +86 -0
  1185. vllm/v1/structured_output/utils.py +175 -0
  1186. vllm/v1/utils.py +377 -0
  1187. vllm/v1/worker/__init__.py +0 -0
  1188. vllm/v1/worker/block_table.py +142 -0
  1189. vllm/v1/worker/cpu_model_runner.py +91 -0
  1190. vllm/v1/worker/cpu_worker.py +153 -0
  1191. vllm/v1/worker/gpu_input_batch.py +757 -0
  1192. vllm/v1/worker/gpu_model_runner.py +2739 -0
  1193. vllm/v1/worker/gpu_worker.py +408 -0
  1194. vllm/v1/worker/lora_model_runner_mixin.py +177 -0
  1195. vllm/v1/worker/tpu_input_batch.py +585 -0
  1196. vllm/v1/worker/tpu_model_runner.py +1849 -0
  1197. vllm/v1/worker/tpu_worker.py +315 -0
  1198. vllm/v1/worker/utils.py +112 -0
  1199. vllm/v1/worker/worker_base.py +65 -0
  1200. vllm/v1/worker/xpu_model_runner.py +33 -0
  1201. vllm/v1/worker/xpu_worker.py +165 -0
  1202. vllm/version.py +41 -0
  1203. vllm/vllm_flash_attn/.gitkeep +0 -0
  1204. vllm/worker/__init__.py +0 -0
  1205. vllm/worker/cache_engine.py +145 -0
  1206. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1207. vllm/worker/cpu_model_runner.py +671 -0
  1208. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1209. vllm/worker/cpu_worker.py +452 -0
  1210. vllm/worker/enc_dec_model_runner.py +555 -0
  1211. vllm/worker/hpu_model_runner.py +2320 -0
  1212. vllm/worker/hpu_worker.py +484 -0
  1213. vllm/worker/model_runner.py +2178 -0
  1214. vllm/worker/model_runner_base.py +282 -0
  1215. vllm/worker/multi_step_hpu_worker.py +123 -0
  1216. vllm/worker/multi_step_model_runner.py +911 -0
  1217. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1218. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1219. vllm/worker/multi_step_tpu_worker.py +108 -0
  1220. vllm/worker/multi_step_worker.py +197 -0
  1221. vllm/worker/neuron_model_runner.py +460 -0
  1222. vllm/worker/neuron_worker.py +193 -0
  1223. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1224. vllm/worker/pooling_model_runner.py +211 -0
  1225. vllm/worker/tpu_model_runner.py +909 -0
  1226. vllm/worker/tpu_worker.py +337 -0
  1227. vllm/worker/utils.py +53 -0
  1228. vllm/worker/worker.py +577 -0
  1229. vllm/worker/worker_base.py +646 -0
  1230. vllm/worker/xpu_model_runner.py +606 -0
  1231. vllm/worker/xpu_worker.py +186 -0
  1232. vllm_cpu-0.9.2.post2.dist-info/METADATA +339 -0
  1233. vllm_cpu-0.9.2.post2.dist-info/RECORD +1236 -0
  1234. vllm_cpu-0.9.2.post2.dist-info/WHEEL +5 -0
  1235. vllm_cpu-0.9.2.post2.dist-info/entry_points.txt +5 -0
  1236. vllm_cpu-0.9.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1115 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ from __future__ import annotations
5
+
6
+ import itertools
7
+ import time
8
+ from collections import defaultdict
9
+ from collections.abc import Iterable
10
+ from typing import Any, Optional, Union
11
+
12
+ from vllm.config import VllmConfig
13
+ from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch
14
+ from vllm.distributed.kv_transfer.kv_connector.factory import (
15
+ KVConnectorFactory)
16
+ from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
17
+ KVConnectorRole)
18
+ from vllm.logger import init_logger
19
+ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
20
+ from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
21
+ compute_encoder_budget)
22
+ from vllm.v1.core.kv_cache_manager import KVCacheManager
23
+ from vllm.v1.core.sched.interface import SchedulerInterface
24
+ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
25
+ SchedulerOutput)
26
+ from vllm.v1.core.sched.request_queue import (SchedulingPolicy,
27
+ create_request_queue)
28
+ from vllm.v1.core.sched.utils import check_stop
29
+ from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput,
30
+ EngineCoreOutputs)
31
+ from vllm.v1.kv_cache_interface import KVCacheConfig
32
+ from vllm.v1.metrics.stats import SchedulerStats
33
+ from vllm.v1.outputs import ModelRunnerOutput
34
+ from vllm.v1.request import Request, RequestStatus
35
+ from vllm.v1.spec_decode.metrics import SpecDecodingStats
36
+ from vllm.v1.structured_output import StructuredOutputManager
37
+
38
+ logger = init_logger(__name__)
39
+
40
+
41
+ class Scheduler(SchedulerInterface):
42
+
43
+ def __init__(
44
+ self,
45
+ vllm_config: VllmConfig,
46
+ kv_cache_config: KVCacheConfig,
47
+ structured_output_manager: StructuredOutputManager,
48
+ mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
49
+ include_finished_set: bool = False,
50
+ log_stats: bool = False,
51
+ ) -> None:
52
+ self.vllm_config = vllm_config
53
+ self.scheduler_config = vllm_config.scheduler_config
54
+ self.cache_config = vllm_config.cache_config
55
+ self.lora_config = vllm_config.lora_config
56
+ self.kv_cache_config = kv_cache_config
57
+ self.kv_events_config = vllm_config.kv_events_config
58
+ self.parallel_config = vllm_config.parallel_config
59
+ self.log_stats = log_stats
60
+ self.structured_output_manager = structured_output_manager
61
+
62
+ # include_finished_set controls whether a separate set of finished
63
+ # request ids should be included in the EngineCoreOutputs returned
64
+ # by update_from_outputs(). This is currently used in the multi-engine
65
+ # case to track request lifetimes efficiently.
66
+ self.finished_req_ids_dict: Optional[dict[int, set[str]]] = (
67
+ defaultdict(set) if include_finished_set else None)
68
+
69
+ # Scheduling constraints.
70
+ self.max_num_running_reqs = self.scheduler_config.max_num_seqs
71
+ self.max_num_scheduled_tokens = \
72
+ self.scheduler_config.max_num_batched_tokens
73
+ self.max_model_len = self.scheduler_config.max_model_len
74
+ self.enable_kv_cache_events = (
75
+ self.kv_events_config is not None
76
+ and self.kv_events_config.enable_kv_cache_events)
77
+
78
+ # Create KVConnector for the Scheduler. Note that each Worker
79
+ # will have a corresponding KVConnector with Role=WORKER.
80
+ # KV Connector pushes/pull of remote KVs for P/D and offloading.
81
+ self.connector = None
82
+ if self.vllm_config.kv_transfer_config is not None:
83
+ assert len(self.kv_cache_config.kv_cache_groups) == 1, (
84
+ "Multiple KV cache groups are not currently supported "
85
+ "with KV connectors")
86
+ self.connector = KVConnectorFactory.create_connector_v1(
87
+ config=self.vllm_config, role=KVConnectorRole.SCHEDULER)
88
+
89
+ self.kv_event_publisher = EventPublisherFactory.create(
90
+ self.kv_events_config,
91
+ self.parallel_config.data_parallel_rank,
92
+ )
93
+
94
+ num_gpu_blocks = self.cache_config.num_gpu_blocks
95
+ assert num_gpu_blocks is not None and num_gpu_blocks > 0
96
+
97
+ self.block_size = self.cache_config.block_size
98
+
99
+ # req_id -> Request
100
+ self.requests: dict[str, Request] = {}
101
+ # Scheduling policy
102
+ if self.scheduler_config.policy == "priority":
103
+ self.policy = SchedulingPolicy.PRIORITY
104
+ elif self.scheduler_config.policy == "fcfs":
105
+ self.policy = SchedulingPolicy.FCFS
106
+ else:
107
+ raise ValueError(
108
+ f"Unknown scheduling policy: {self.scheduler_config.policy}")
109
+ # Priority queues for requests.
110
+ self.waiting = create_request_queue(self.policy)
111
+ self.running: list[Request] = []
112
+
113
+ # The request IDs that are finished in between the previous and the
114
+ # current steps. This is used to notify the workers about the finished
115
+ # requests so that they can free the cached states for those requests.
116
+ # This is flushed at the end of each scheduling step.
117
+ self.finished_req_ids: set[str] = set()
118
+
119
+ # KV Connector: requests in process of async KV loading or recving
120
+ self.finished_recving_kv_req_ids: set[str] = set()
121
+
122
+ # Encoder-related.
123
+ # Calculate encoder cache size if applicable
124
+ # NOTE: For now we use the same budget for both compute and space.
125
+ # This can be changed when we make encoder cache for embedding caching
126
+ # across requests.
127
+ encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
128
+ model_config=vllm_config.model_config,
129
+ scheduler_config=vllm_config.scheduler_config,
130
+ mm_registry=mm_registry,
131
+ )
132
+
133
+ # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
134
+ # projector if needed). Currently, we assume that the encoder also
135
+ # has the Transformer architecture (e.g., ViT).
136
+ self.max_num_encoder_input_tokens = encoder_compute_budget
137
+ # NOTE: For the models without encoder (e.g., text-only models),
138
+ # the encoder cache will not be initialized because cache size is 0
139
+ # for these models.
140
+ self.encoder_cache_manager = EncoderCacheManager(
141
+ cache_size=encoder_cache_size)
142
+
143
+ speculative_config = vllm_config.speculative_config
144
+
145
+ self.use_eagle = False
146
+ self.num_spec_tokens = self.num_lookahead_tokens = 0
147
+ if speculative_config:
148
+ self.num_spec_tokens = speculative_config.num_speculative_tokens
149
+ if speculative_config.use_eagle():
150
+ self.use_eagle = True
151
+ self.num_lookahead_tokens = self.num_spec_tokens
152
+
153
+ # Create the KV cache manager.
154
+ self.kv_cache_manager = KVCacheManager(
155
+ kv_cache_config=kv_cache_config,
156
+ max_model_len=self.max_model_len,
157
+ enable_caching=self.cache_config.enable_prefix_caching,
158
+ caching_hash_algo=self.cache_config.prefix_caching_hash_algo,
159
+ use_eagle=self.use_eagle,
160
+ log_stats=self.log_stats,
161
+ enable_kv_cache_events=self.enable_kv_cache_events,
162
+ )
163
+ self.use_pp = self.parallel_config.pipeline_parallel_size > 1
164
+
165
+ def schedule(self) -> SchedulerOutput:
166
+ # NOTE(woosuk) on the scheduling algorithm:
167
+ # There's no "decoding phase" nor "prefill phase" in the scheduler.
168
+ # Each request just has the num_computed_tokens and
169
+ # num_tokens_with_spec. num_tokens_with_spec =
170
+ # len(prompt_token_ids) + len(output_token_ids) + len(spec_token_ids).
171
+ # At each step, the scheduler tries to assign tokens to the requests
172
+ # so that each request's num_computed_tokens can catch up its
173
+ # num_tokens_with_spec. This is general enough to cover
174
+ # chunked prefills, prefix caching, speculative decoding,
175
+ # and the "jump decoding" optimization in the future.
176
+
177
+ scheduled_new_reqs: list[Request] = []
178
+ scheduled_resumed_reqs: list[Request] = []
179
+ scheduled_running_reqs: list[Request] = []
180
+ preempted_reqs: list[Request] = []
181
+
182
+ # NOTE: structured_output_request_ids maps
183
+ # a request's (request that uses structured output)
184
+ # request_id to the running request index.
185
+ # This will helps us determine to slice the grammar bitmask
186
+ # and only applies valid mask for requests that
187
+ # uses structured decoding.
188
+ structured_output_request_ids: dict[str, int] = {}
189
+
190
+ req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {}
191
+ num_scheduled_tokens: dict[str, int] = {}
192
+ token_budget = self.max_num_scheduled_tokens
193
+ # Encoder-related.
194
+ scheduled_encoder_inputs: dict[str, list[int]] = {}
195
+ encoder_budget = self.max_num_encoder_input_tokens
196
+ # Spec decode-related.
197
+ scheduled_spec_decode_tokens: dict[str, list[int]] = {}
198
+
199
+ # For logging.
200
+ scheduled_timestamp = time.monotonic()
201
+
202
+ # First, schedule the RUNNING requests.
203
+ req_index = 0
204
+ while req_index < len(self.running) and token_budget > 0:
205
+ request = self.running[req_index]
206
+
207
+ num_new_tokens = (request.num_tokens_with_spec -
208
+ request.num_computed_tokens)
209
+ if (0 < self.scheduler_config.long_prefill_token_threshold <
210
+ num_new_tokens):
211
+ num_new_tokens = (
212
+ self.scheduler_config.long_prefill_token_threshold)
213
+ num_new_tokens = min(num_new_tokens, token_budget)
214
+
215
+ # Make sure the input position does not exceed the max model len.
216
+ # This is necessary when using spec decoding.
217
+ num_new_tokens = min(
218
+ num_new_tokens,
219
+ self.max_model_len - 1 - request.num_computed_tokens)
220
+
221
+ # Schedule encoder inputs.
222
+ encoder_inputs_to_schedule = None
223
+ new_encoder_budget = encoder_budget
224
+ if request.has_encoder_inputs:
225
+ (encoder_inputs_to_schedule, num_new_tokens,
226
+ new_encoder_budget) = self._try_schedule_encoder_inputs(
227
+ request, request.num_computed_tokens, num_new_tokens,
228
+ encoder_budget)
229
+
230
+ if num_new_tokens == 0:
231
+ # The request cannot be scheduled because one of the following
232
+ # reasons:
233
+ # 1. No new tokens to schedule. This may happen when PP>1 and
234
+ # we have already scheduled all prompt tokens but they are
235
+ # not finished yet.
236
+ # 2. The encoder budget is exhausted.
237
+ # 3. The encoder cache is exhausted.
238
+ # NOTE(woosuk): Here, by doing `continue` instead of `break`,
239
+ # we do not strictly follow the FCFS scheduling policy and
240
+ # allow the lower-priority requests to be scheduled.
241
+ req_index += 1
242
+ continue
243
+
244
+ num_draft_tokens = max(
245
+ num_new_tokens + request.num_computed_tokens -
246
+ request.num_tokens, 0)
247
+
248
+ while True:
249
+ new_blocks = self.kv_cache_manager.allocate_slots(
250
+ request,
251
+ num_new_tokens,
252
+ num_draft_tokens=num_draft_tokens,
253
+ num_lookahead_tokens=self.num_lookahead_tokens)
254
+ if new_blocks is None:
255
+ # The request cannot be scheduled.
256
+ # Preempt the lowest-priority request.
257
+ if self.policy == SchedulingPolicy.PRIORITY:
258
+ preempted_req = max(
259
+ self.running,
260
+ key=lambda r: (r.priority, r.arrival_time),
261
+ )
262
+ self.running.remove(preempted_req)
263
+ else:
264
+ preempted_req = self.running.pop()
265
+
266
+ self.kv_cache_manager.free(preempted_req)
267
+ preempted_req.status = RequestStatus.PREEMPTED
268
+ preempted_req.num_computed_tokens = 0
269
+ if self.log_stats:
270
+ preempted_req.record_event(
271
+ EngineCoreEventType.PREEMPTED, scheduled_timestamp)
272
+
273
+ self.waiting.prepend_request(preempted_req)
274
+ preempted_reqs.append(preempted_req)
275
+ if preempted_req == request:
276
+ # No more request to preempt.
277
+ can_schedule = False
278
+ break
279
+ else:
280
+ # The request can be scheduled.
281
+ can_schedule = True
282
+ break
283
+ if not can_schedule:
284
+ break
285
+ assert new_blocks is not None
286
+
287
+ # Schedule the request.
288
+ scheduled_running_reqs.append(request)
289
+ if request.use_structured_output:
290
+ # PERF: in case of chunked prefill,
291
+ # request might not include any new tokens.
292
+ # Therefore, we might introduce some additional
293
+ # cycle to fill in the bitmask, which could be a big no-op.
294
+ structured_output_request_ids[request.request_id] = req_index
295
+ req_to_new_block_ids[request.request_id] = (
296
+ new_blocks.get_block_ids())
297
+ num_scheduled_tokens[request.request_id] = num_new_tokens
298
+ token_budget -= num_new_tokens
299
+ req_index += 1
300
+
301
+ # Speculative decode related.
302
+ if request.spec_token_ids:
303
+ num_scheduled_spec_tokens = (num_new_tokens +
304
+ request.num_computed_tokens -
305
+ request.num_tokens)
306
+ if num_scheduled_spec_tokens > 0:
307
+ # Trim spec_token_ids list to num_scheduled_spec_tokens.
308
+ del request.spec_token_ids[num_scheduled_spec_tokens:]
309
+ scheduled_spec_decode_tokens[request.request_id] = (
310
+ request.spec_token_ids)
311
+
312
+ # Encoder-related.
313
+ if encoder_inputs_to_schedule:
314
+ scheduled_encoder_inputs[request.request_id] = (
315
+ encoder_inputs_to_schedule)
316
+ # Allocate the encoder cache.
317
+ for i in encoder_inputs_to_schedule:
318
+ self.encoder_cache_manager.allocate(request, i)
319
+ encoder_budget = new_encoder_budget
320
+
321
+ # Record the LoRAs in scheduled_running_reqs
322
+ scheduled_loras: set[int] = set()
323
+ if self.lora_config:
324
+ scheduled_loras = set(
325
+ req.lora_request.lora_int_id for req in scheduled_running_reqs
326
+ if req.lora_request and req.lora_request.lora_int_id > 0)
327
+ assert len(scheduled_loras) <= self.lora_config.max_loras
328
+
329
+ # Use a temporary RequestQueue to collect requests that need to be
330
+ # skipped and put back at the head of the waiting queue later
331
+ skipped_waiting_requests = create_request_queue(self.policy)
332
+
333
+ # Next, schedule the WAITING requests.
334
+ if not preempted_reqs:
335
+ while self.waiting and token_budget > 0:
336
+ if len(self.running) == self.max_num_running_reqs:
337
+ break
338
+
339
+ request = self.waiting.peek_request()
340
+
341
+ # KVTransfer: skip request if still waiting for remote kvs.
342
+ if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
343
+ is_ready = self._update_waiting_for_remote_kv(request)
344
+ if is_ready:
345
+ request.status = RequestStatus.WAITING
346
+ else:
347
+ logger.debug(
348
+ "%s is still in WAITING_FOR_REMOTE_KVS state.",
349
+ request.request_id)
350
+ self.waiting.pop_request()
351
+ skipped_waiting_requests.prepend_request(request)
352
+ continue
353
+
354
+ # Skip request if the structured output request is still waiting
355
+ # for FSM compilation.
356
+ if request.status == RequestStatus.WAITING_FOR_FSM:
357
+ structured_output_req = request.structured_output_request
358
+ if structured_output_req and structured_output_req.grammar:
359
+ request.status = RequestStatus.WAITING
360
+ else:
361
+ self.waiting.pop_request()
362
+ skipped_waiting_requests.prepend_request(request)
363
+ continue
364
+
365
+ # Check that adding the request still respects the max_loras
366
+ # constraint.
367
+ if (self.lora_config and request.lora_request and
368
+ (len(scheduled_loras) == self.lora_config.max_loras and
369
+ request.lora_request.lora_int_id not in scheduled_loras)):
370
+ # Scheduling would exceed max_loras, skip.
371
+ self.waiting.pop_request()
372
+ skipped_waiting_requests.prepend_request(request)
373
+ continue
374
+
375
+ num_external_computed_tokens = 0
376
+ load_kv_async = False
377
+
378
+ # Get already-cached tokens.
379
+ if request.num_computed_tokens == 0:
380
+ # Get locally-cached tokens.
381
+ new_computed_blocks, num_new_local_computed_tokens = \
382
+ self.kv_cache_manager.get_computed_blocks(
383
+ request)
384
+
385
+ # Get externally-cached tokens if using a KVConnector.
386
+ if self.connector is not None:
387
+ num_external_computed_tokens, load_kv_async = (
388
+ self.connector.get_num_new_matched_tokens(
389
+ request, num_new_local_computed_tokens))
390
+
391
+ # Total computed tokens (local + external).
392
+ num_computed_tokens = (num_new_local_computed_tokens +
393
+ num_external_computed_tokens)
394
+ # KVTransfer: WAITING reqs have num_computed_tokens > 0
395
+ # after async KV recvs are completed.
396
+ else:
397
+ new_computed_blocks = (
398
+ self.kv_cache_manager.create_empty_block_list())
399
+ num_new_local_computed_tokens = 0
400
+ num_computed_tokens = request.num_computed_tokens
401
+
402
+ encoder_inputs_to_schedule = None
403
+ new_encoder_budget = encoder_budget
404
+
405
+ # KVTransfer: loading remote KV, do not allocate for new work.
406
+ if load_kv_async:
407
+ assert num_external_computed_tokens > 0
408
+ num_new_tokens = 0
409
+ # Number of tokens to be scheduled.
410
+ else:
411
+ # We use `request.num_tokens` instead of
412
+ # `request.num_prompt_tokens` to consider the resumed
413
+ # requests, which have output tokens.
414
+ num_new_tokens = request.num_tokens - num_computed_tokens
415
+ if (0 < self.scheduler_config.long_prefill_token_threshold
416
+ < num_new_tokens):
417
+ num_new_tokens = (
418
+ self.scheduler_config.long_prefill_token_threshold)
419
+
420
+ # chunked prefill has to be enabled explicitly to allow
421
+ # pooling requests to be chunked
422
+ if not self.scheduler_config.chunked_prefill_enabled and \
423
+ num_new_tokens > token_budget:
424
+ self.waiting.pop_request()
425
+ skipped_waiting_requests.prepend_request(request)
426
+ continue
427
+
428
+ num_new_tokens = min(num_new_tokens, token_budget)
429
+ assert num_new_tokens > 0
430
+
431
+ # Schedule encoder inputs.
432
+ if request.has_encoder_inputs:
433
+ (encoder_inputs_to_schedule, num_new_tokens,
434
+ new_encoder_budget
435
+ ) = self._try_schedule_encoder_inputs(
436
+ request, num_computed_tokens, num_new_tokens,
437
+ encoder_budget)
438
+ if num_new_tokens == 0:
439
+ # The request cannot be scheduled.
440
+ break
441
+
442
+ new_blocks = self.kv_cache_manager.allocate_slots(
443
+ request,
444
+ num_new_tokens + num_external_computed_tokens,
445
+ num_new_local_computed_tokens,
446
+ new_computed_blocks,
447
+ num_lookahead_tokens=self.num_lookahead_tokens,
448
+ delay_cache_blocks=load_kv_async,
449
+ )
450
+ if new_blocks is None:
451
+ # The request cannot be scheduled.
452
+ break
453
+
454
+ # KVTransfer: the connector uses this info to determine
455
+ # if a load is needed. Note that
456
+ # This information is used to determine if a load is
457
+ # needed for this request.
458
+ if self.connector is not None:
459
+ self.connector.update_state_after_alloc(
460
+ request,
461
+ new_computed_blocks + new_blocks,
462
+ num_external_computed_tokens,
463
+ )
464
+
465
+ # Request was already popped from self.waiting
466
+ # unless it was re-added above due to new_blocks being None.
467
+ request = self.waiting.pop_request()
468
+ if load_kv_async:
469
+ # If loading async, allocate memory and put request
470
+ # into the WAITING_FOR_REMOTE_KV state.
471
+ skipped_waiting_requests.prepend_request(request)
472
+ request.status = RequestStatus.WAITING_FOR_REMOTE_KVS
473
+ continue
474
+
475
+ if request.use_structured_output:
476
+ structured_output_request_ids[request.request_id] = (
477
+ req_index)
478
+ req_index += 1
479
+ self.running.append(request)
480
+ if self.log_stats:
481
+ request.record_event(EngineCoreEventType.SCHEDULED,
482
+ scheduled_timestamp)
483
+ if request.status == RequestStatus.WAITING:
484
+ scheduled_new_reqs.append(request)
485
+ elif request.status == RequestStatus.PREEMPTED:
486
+ scheduled_resumed_reqs.append(request)
487
+ else:
488
+ raise RuntimeError(
489
+ f"Invalid request status: {request.status}")
490
+
491
+ if self.lora_config and request.lora_request:
492
+ scheduled_loras.add(request.lora_request.lora_int_id)
493
+ req_to_new_block_ids[request.request_id] = (
494
+ self.kv_cache_manager.get_block_ids(request.request_id))
495
+ num_scheduled_tokens[request.request_id] = num_new_tokens
496
+ token_budget -= num_new_tokens
497
+ request.status = RequestStatus.RUNNING
498
+ request.num_computed_tokens = num_computed_tokens
499
+ # Count the number of prefix cached tokens.
500
+ if request.num_cached_tokens < 0:
501
+ request.num_cached_tokens = num_computed_tokens
502
+ # Encoder-related.
503
+ if encoder_inputs_to_schedule:
504
+ scheduled_encoder_inputs[request.request_id] = (
505
+ encoder_inputs_to_schedule)
506
+ # Allocate the encoder cache.
507
+ for i in encoder_inputs_to_schedule:
508
+ self.encoder_cache_manager.allocate(request, i)
509
+ encoder_budget = new_encoder_budget
510
+
511
+ # Put back any skipped requests at the head of the waiting queue
512
+ if skipped_waiting_requests:
513
+ self.waiting.prepend_requests(skipped_waiting_requests)
514
+
515
+ # Check if the scheduling constraints are satisfied.
516
+ total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
517
+ assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
518
+ assert token_budget >= 0
519
+ assert len(self.running) <= self.max_num_running_reqs
520
+ # Since some requests in the RUNNING queue may not be scheduled in
521
+ # this step, the total number of scheduled requests can be smaller than
522
+ # len(self.running).
523
+ assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) +
524
+ len(scheduled_running_reqs) <= len(self.running))
525
+
526
+ # Get the longest common prefix among all requests in the running queue.
527
+ # This can be potentially used for cascade attention.
528
+ num_common_prefix_blocks = [0] * len(
529
+ self.kv_cache_config.kv_cache_groups)
530
+ if self.running:
531
+ any_request = self.running[0]
532
+ num_common_prefix_blocks = (
533
+ self.kv_cache_manager.get_num_common_prefix_blocks(
534
+ any_request, len(self.running)))
535
+
536
+ grammar_bitmask = self.structured_output_manager.grammar_bitmask(
537
+ self.requests,
538
+ structured_output_request_ids,
539
+ scheduled_spec_decode_tokens,
540
+ )
541
+ # Construct the scheduler output.
542
+ new_reqs_data = [
543
+ NewRequestData.from_request(req,
544
+ req_to_new_block_ids[req.request_id])
545
+ for req in scheduled_new_reqs
546
+ ]
547
+ cached_reqs_data = self._make_cached_request_data(
548
+ scheduled_running_reqs,
549
+ scheduled_resumed_reqs,
550
+ num_scheduled_tokens,
551
+ scheduled_spec_decode_tokens,
552
+ req_to_new_block_ids,
553
+ )
554
+ scheduler_output = SchedulerOutput(
555
+ scheduled_new_reqs=new_reqs_data,
556
+ scheduled_cached_reqs=cached_reqs_data,
557
+ num_scheduled_tokens=num_scheduled_tokens,
558
+ total_num_scheduled_tokens=total_num_scheduled_tokens,
559
+ scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
560
+ scheduled_encoder_inputs=scheduled_encoder_inputs,
561
+ num_common_prefix_blocks=num_common_prefix_blocks,
562
+ # finished_req_ids is an existing state in the scheduler,
563
+ # instead of being newly scheduled in this step.
564
+ # It contains the request IDs that are finished in between
565
+ # the previous and the current steps.
566
+ finished_req_ids=self.finished_req_ids,
567
+ free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
568
+ structured_output_request_ids=structured_output_request_ids,
569
+ grammar_bitmask=grammar_bitmask,
570
+ )
571
+
572
+ # NOTE(Kuntai): this function is designed for multiple purposes:
573
+ # 1. Plan the KV cache store
574
+ # 2. Wrap up all the KV cache load / save ops into an opaque object
575
+ # 3. Clear the internal states of the connector
576
+ if self.connector is not None:
577
+ meta = self.connector.build_connector_meta(scheduler_output)
578
+ scheduler_output.kv_connector_metadata = meta
579
+
580
+ events = self.kv_cache_manager.take_events()
581
+ if events:
582
+ batch = KVEventBatch(ts=time.time(), events=events)
583
+ self.kv_event_publisher.publish(batch)
584
+
585
+ self._update_after_schedule(scheduler_output)
586
+ return scheduler_output
587
+
588
+ def _update_after_schedule(
589
+ self,
590
+ scheduler_output: SchedulerOutput,
591
+ ) -> None:
592
+ # Advance the number of computed tokens for the request AFTER
593
+ # the request is scheduled.
594
+ # 1. The scheduler_output of the current step has to include the
595
+ # original number of scheduled tokens to determine input IDs.
596
+ # 2. Advance the number of computed tokens here allowing us to
597
+ # schedule the prefill request again immediately in the next
598
+ # scheduling step.
599
+ # 3. If some tokens (e.g. spec tokens) are rejected later, the number of
600
+ # computed tokens will be adjusted in update_from_output.
601
+ num_scheduled_tokens = scheduler_output.num_scheduled_tokens
602
+ for req_id, num_scheduled_token in num_scheduled_tokens.items():
603
+ request = self.requests[req_id]
604
+ request.num_computed_tokens += num_scheduled_token
605
+
606
+ # Clear the finished request IDs.
607
+ # NOTE: We shouldn't do self.finished_req_ids.clear() here because
608
+ # it will also affect the scheduler output.
609
+ self.finished_req_ids = set()
610
+
611
+ def _make_cached_request_data(
612
+ self,
613
+ running_reqs: list[Request],
614
+ resumed_reqs: list[Request],
615
+ num_scheduled_tokens: dict[str, int],
616
+ spec_decode_tokens: dict[str, list[int]],
617
+ req_to_new_block_ids: dict[str, tuple[list[int], ...]],
618
+ ) -> CachedRequestData:
619
+ req_ids: list[str] = []
620
+ new_token_ids: list[list[int]] = []
621
+ new_block_ids: list[tuple[list[int], ...]] = []
622
+ num_computed_tokens: list[int] = []
623
+
624
+ for req in itertools.chain(running_reqs, resumed_reqs):
625
+ req_id = req.request_id
626
+ req_ids.append(req_id)
627
+ num_tokens = (num_scheduled_tokens[req_id] -
628
+ len(spec_decode_tokens.get(req_id, ())))
629
+ if self.use_pp:
630
+ # When using PP, the scheduler sends the sampled tokens back,
631
+ # because there's no direct communication between the first-
632
+ # stage worker and the last-stage worker. Otherwise, we don't
633
+ # need to send the sampled tokens back because the model runner
634
+ # will cache them.
635
+ token_ids = req.all_token_ids[req.num_computed_tokens:req.
636
+ num_computed_tokens + num_tokens]
637
+ new_token_ids.append(token_ids)
638
+ new_block_ids.append(req_to_new_block_ids[req_id])
639
+ num_computed_tokens.append(req.num_computed_tokens)
640
+ # Because resumed_reqs is usually empty, it is more efficient to do
641
+ # in-place appending so that we don't need to allocate a new list.
642
+ resumed_from_preemption = [False] * len(running_reqs)
643
+ resumed_from_preemption += [True] * len(resumed_reqs)
644
+
645
+ return CachedRequestData(
646
+ req_ids=req_ids,
647
+ resumed_from_preemption=resumed_from_preemption,
648
+ new_token_ids=new_token_ids,
649
+ new_block_ids=new_block_ids,
650
+ num_computed_tokens=num_computed_tokens,
651
+ )
652
+
653
+ def _try_schedule_encoder_inputs(
654
+ self,
655
+ request: Request,
656
+ num_computed_tokens: int,
657
+ num_new_tokens: int,
658
+ encoder_budget: int,
659
+ ) -> tuple[list[int], int, int]:
660
+ """
661
+ Determine which encoder inputs need to be scheduled in the current step,
662
+ and update `num_new_tokens` and encoder token budget accordingly.
663
+
664
+ An encoder input will be scheduled if:
665
+ - Its output tokens overlap with the range of tokens being computed
666
+ in this step, i.e.,
667
+ [num_computed_tokens, num_computed_tokens + num_new_tokens).
668
+ - It is not already computed and stored in the encoder cache.
669
+ - There is sufficient encoder token budget to process it.
670
+ - The encoder cache has space to store it.
671
+
672
+ If an encoder input cannot be scheduled due to cache or budget
673
+ limitations, the method adjusts `num_new_tokens` to schedule only the
674
+ decoder tokens up to just before the unschedulable encoder input.
675
+
676
+ Note that num_computed_tokens includes both locally cached
677
+ blocks and externally cached blocks (via KVConnector).
678
+ """
679
+ if num_new_tokens == 0 or not request.has_encoder_inputs:
680
+ return [], num_new_tokens, encoder_budget
681
+ encoder_inputs_to_schedule: list[int] = []
682
+ mm_positions = request.mm_positions
683
+ assert mm_positions is not None
684
+ assert len(mm_positions) > 0
685
+ for i, pos_info in enumerate(mm_positions):
686
+ start_pos = pos_info.offset
687
+ num_encoder_tokens = pos_info.length
688
+
689
+ # The encoder output is needed if the two ranges overlap:
690
+ # [num_computed_tokens, num_computed_tokens + num_new_tokens) and
691
+ # [start_pos, start_pos + num_encoder_tokens)
692
+ if start_pos >= num_computed_tokens + num_new_tokens:
693
+ # The encoder input is not needed in this step.
694
+ break
695
+ if start_pos + num_encoder_tokens <= num_computed_tokens:
696
+ # The encoder input is already computed and stored
697
+ # in the decoder's KV cache.
698
+ continue
699
+
700
+ if self.encoder_cache_manager.has_cache(request, i):
701
+ # The encoder input is already computed and cached.
702
+ continue
703
+
704
+ # If no encoder input chunking is allowed, we do not want to
705
+ # partially schedule a multimodal item. If the scheduled range would
706
+ # only cover part of the mm input, roll back to before the mm item.
707
+ if (self.scheduler_config.disable_chunked_mm_input
708
+ and num_computed_tokens < start_pos
709
+ and (num_computed_tokens + num_new_tokens)
710
+ < (start_pos + num_encoder_tokens)):
711
+ num_new_tokens = start_pos - num_computed_tokens
712
+ break
713
+
714
+ if (not self.encoder_cache_manager.can_allocate(request, i)
715
+ or num_encoder_tokens > encoder_budget):
716
+ # The encoder cache is full or the encoder budget is exhausted.
717
+ # NOTE(woosuk): We assume that the encoder input tokens should
718
+ # be processed altogether, as the encoder usually uses
719
+ # bidirectional attention.
720
+ if num_computed_tokens < start_pos:
721
+ # We only schedule the decoder tokens just before the
722
+ # encoder input.
723
+ num_new_tokens = start_pos - num_computed_tokens
724
+ else:
725
+ # Because of prefix caching, num_computed_tokens is greater
726
+ # than start_pos even though its encoder input is not
727
+ # available. In this case, we can't schedule any token for
728
+ # the request in this step.
729
+ num_new_tokens = 0
730
+ break
731
+
732
+ encoder_budget -= num_encoder_tokens
733
+ encoder_inputs_to_schedule.append(i)
734
+ return encoder_inputs_to_schedule, num_new_tokens, encoder_budget
735
+
736
+ def update_from_output(
737
+ self,
738
+ scheduler_output: SchedulerOutput,
739
+ model_runner_output: ModelRunnerOutput,
740
+ ) -> dict[int, EngineCoreOutputs]:
741
+ sampled_token_ids = model_runner_output.sampled_token_ids
742
+ spec_token_ids = model_runner_output.spec_token_ids
743
+ logprobs = model_runner_output.logprobs
744
+ prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict
745
+ num_scheduled_tokens = scheduler_output.num_scheduled_tokens
746
+ pooler_outputs = model_runner_output.pooler_output
747
+ num_nans_in_logits = model_runner_output.num_nans_in_logits
748
+
749
+ new_running: list[Request] = []
750
+ outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
751
+ spec_decoding_stats: Optional[SpecDecodingStats] = None
752
+
753
+ # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
754
+ # loop can be a performance bottleneck. We should do our best to avoid
755
+ # expensive operations inside the loop.
756
+ for request in self.running:
757
+ req_id = request.request_id
758
+ num_tokens_scheduled = num_scheduled_tokens.get(req_id, 0)
759
+ if num_tokens_scheduled == 0:
760
+ # The request was not scheduled in this step.
761
+ new_running.append(request)
762
+ continue
763
+
764
+ req_index = model_runner_output.req_id_to_index[req_id]
765
+ generated_token_ids = sampled_token_ids[
766
+ req_index] if sampled_token_ids else []
767
+
768
+ scheduled_spec_token_ids = (
769
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id))
770
+ if scheduled_spec_token_ids:
771
+ # num_computed_tokens represents the number of tokens
772
+ # processed in the current step, considering scheduled
773
+ # tokens and rejections. If some tokens are rejected,
774
+ # num_computed_tokens is decreased by the number of rejected
775
+ # tokens, where is given by:
776
+ # len(scheduled_spec_token_ids) + 1 - len(generated_token_ids).
777
+ num_tokens_rejected = (len(scheduled_spec_token_ids) + 1 -
778
+ len(generated_token_ids))
779
+ request.num_computed_tokens -= num_tokens_rejected
780
+ spec_decoding_stats = self.make_spec_decoding_stats(
781
+ spec_decoding_stats,
782
+ num_draft_tokens=len(scheduled_spec_token_ids),
783
+ num_accepted_tokens=len(generated_token_ids) - 1)
784
+
785
+ # NOTE(woosuk): This has to be executed after updating
786
+ # `request.num_computed_tokens`.
787
+ if request.has_encoder_inputs:
788
+ self._free_encoder_inputs(request)
789
+
790
+ stopped = False
791
+ new_logprobs = None
792
+ new_token_ids = generated_token_ids
793
+ kv_transfer_params = None
794
+
795
+ # Append generated tokens and check for stop. Note that if
796
+ # a request is still being prefilled, we expect the model runner
797
+ # to return empty token ids for the request.
798
+ for num_new, output_token_id in enumerate(new_token_ids, 1):
799
+ request.append_output_token_ids(output_token_id)
800
+
801
+ # Check for stop and update request state.
802
+ # This must be called before we make the EngineCoreOutput.
803
+ stopped = check_stop(request, self.max_model_len)
804
+ if stopped:
805
+ kv_transfer_params = self._free_request(request)
806
+ del new_token_ids[num_new:] # Trim new tokens if needed.
807
+ break
808
+
809
+ pooler_output = None
810
+ if pooler_outputs:
811
+ pooler_output = pooler_outputs[req_index]
812
+ stopped = check_stop(request, self.max_model_len,
813
+ pooler_output)
814
+ if stopped:
815
+ kv_transfer_params = self._free_request(request)
816
+
817
+ # Extract sample logprobs if needed.
818
+ if request.sampling_params is not None \
819
+ and request.sampling_params.logprobs is not None and logprobs:
820
+ # NOTE: once we support N tokens per step (spec decode),
821
+ # the outer lists can be of length > 1.
822
+ new_logprobs = logprobs.slice(req_index, req_index + 1)
823
+
824
+ if new_token_ids and self.structured_output_manager.should_advance(
825
+ request):
826
+ # NOTE: structured_output_request
827
+ # should not be None if use_structured_output, we have
828
+ # check above, so safe to ignore type warning
829
+ request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr]
830
+ req_id, new_token_ids)
831
+
832
+ # spec_token_ids comes from the model runner output
833
+ if num_nans_in_logits is not None and req_id in num_nans_in_logits:
834
+ request.num_nans_in_logits = num_nans_in_logits[req_id]
835
+
836
+ # Add newly generated spec token ids to the request.
837
+ if spec_token_ids is not None:
838
+ if self.structured_output_manager.should_advance(request):
839
+ metadata = request.structured_output_request
840
+ # Needs to happen after new_token_ids are accepted.
841
+ request.spec_token_ids = metadata.grammar.validate_tokens( # type: ignore[union-attr]
842
+ spec_token_ids[req_index])
843
+ else:
844
+ request.spec_token_ids = spec_token_ids[req_index]
845
+
846
+ # Get prompt logprobs for this request.
847
+ prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
848
+ if new_token_ids or pooler_output is not None \
849
+ or kv_transfer_params:
850
+
851
+ # Add EngineCoreOutput for this Request.
852
+ outputs[request.client_index].append(
853
+ EngineCoreOutput(
854
+ request_id=req_id,
855
+ new_token_ids=new_token_ids,
856
+ finish_reason=request.get_finished_reason(),
857
+ new_logprobs=new_logprobs,
858
+ new_prompt_logprobs_tensors=prompt_logprobs_tensors,
859
+ pooling_output=pooler_output,
860
+ stop_reason=request.stop_reason,
861
+ events=request.take_events(),
862
+ kv_transfer_params=kv_transfer_params,
863
+ num_cached_tokens=request.num_cached_tokens,
864
+ ))
865
+
866
+ else:
867
+ # Invariant: EngineCore returns no partial prefill outputs.
868
+ assert not prompt_logprobs_tensors
869
+
870
+ if not stopped:
871
+ new_running.append(request)
872
+ self.running = new_running
873
+
874
+ # KV Connector: update state for finished KV Transfers.
875
+ self._update_from_kv_xfer_finished(model_runner_output)
876
+
877
+ # Create EngineCoreOutputs for all clients that have requests with
878
+ # outputs in this step.
879
+ engine_core_outputs = {
880
+ client_index: EngineCoreOutputs(outputs=outs)
881
+ for client_index, outs in outputs.items()
882
+ }
883
+
884
+ finished_req_ids = self.finished_req_ids_dict
885
+ if finished_req_ids:
886
+ # Include ids of requests that finished since last outputs
887
+ # were sent.
888
+ for client_index, finished_set in finished_req_ids.items():
889
+ # Set finished request set in EngineCoreOutputs for this client.
890
+ if (eco := engine_core_outputs.get(client_index)) is not None:
891
+ eco.finished_requests = finished_set
892
+ else:
893
+ engine_core_outputs[client_index] = EngineCoreOutputs(
894
+ finished_requests=finished_set)
895
+ finished_req_ids.clear()
896
+
897
+ if engine_core_outputs:
898
+ # Return stats to only one of the front-ends.
899
+ next(iter(engine_core_outputs.values())).scheduler_stats = (
900
+ self.make_stats(spec_decoding_stats))
901
+
902
+ return engine_core_outputs
903
+
904
+ def _free_encoder_inputs(self, request: Request) -> None:
905
+ cached_encoder_input_ids = (
906
+ self.encoder_cache_manager.get_cached_input_ids(request))
907
+ # OPTIMIZATION: Avoid list(set) if the set is empty.
908
+ if not cached_encoder_input_ids:
909
+ return
910
+
911
+ # Here, we use list(set) to avoid modifying the set while iterating
912
+ # over it.
913
+ for input_id in list(cached_encoder_input_ids):
914
+ mm_positions = request.mm_positions[input_id]
915
+ start_pos = mm_positions.offset
916
+ num_tokens = mm_positions.length
917
+ if start_pos + num_tokens <= request.num_computed_tokens:
918
+ # The encoder output is already processed and stored
919
+ # in the decoder's KV cache.
920
+ self.encoder_cache_manager.free_encoder_input(
921
+ request, input_id)
922
+
923
+ def get_request_counts(self) -> tuple[int, int]:
924
+ """Returns (num_running_reqs, num_waiting_reqs)."""
925
+ return len(self.running), len(self.waiting)
926
+
927
+ def add_request(self, request: Request) -> None:
928
+ self.waiting.add_request(request)
929
+ self.requests[request.request_id] = request
930
+ if self.log_stats:
931
+ request.record_event(EngineCoreEventType.QUEUED)
932
+
933
+ def finish_requests(
934
+ self,
935
+ request_ids: Union[str, Iterable[str]],
936
+ finished_status: RequestStatus,
937
+ ) -> None:
938
+ """Handles the finish signal from outside the scheduler.
939
+
940
+ For example, the API server can abort a request when the client
941
+ disconnects.
942
+ """
943
+ assert RequestStatus.is_finished(finished_status)
944
+ if isinstance(request_ids, str):
945
+ request_ids = (request_ids, )
946
+ else:
947
+ request_ids = set(request_ids)
948
+
949
+ running_requests_to_remove = []
950
+ waiting_requests_to_remove = []
951
+ valid_requests = []
952
+
953
+ # First pass: collect requests to remove from queues
954
+ for req_id in request_ids:
955
+ request = self.requests.get(req_id)
956
+ if request is None:
957
+ # Invalid request ID.
958
+ continue
959
+
960
+ valid_requests.append(request)
961
+ if request.status == RequestStatus.RUNNING:
962
+ running_requests_to_remove.append(request)
963
+ else:
964
+ waiting_requests_to_remove.append(request)
965
+
966
+ # Remove all requests from queues at once for better efficiency
967
+ for request in running_requests_to_remove:
968
+ self.running.remove(request)
969
+ if waiting_requests_to_remove:
970
+ self.waiting.remove_requests(waiting_requests_to_remove)
971
+
972
+ # Second pass: set status and free requests
973
+ for request in valid_requests:
974
+ request.status = finished_status
975
+ self._free_request(request)
976
+
977
+ def _free_request(self, request: Request) -> Optional[dict[str, Any]]:
978
+ assert request.is_finished()
979
+
980
+ delay_free_blocks, kv_xfer_params = self._connector_finished(request)
981
+ self.encoder_cache_manager.free(request)
982
+ request_id = request.request_id
983
+ self.finished_req_ids.add(request_id)
984
+ if self.finished_req_ids_dict is not None:
985
+ self.finished_req_ids_dict[request.client_index].add(request_id)
986
+
987
+ if not delay_free_blocks:
988
+ self._free_blocks(request)
989
+
990
+ return kv_xfer_params
991
+
992
+ def _free_blocks(self, request: Request):
993
+ assert request.is_finished()
994
+ self.kv_cache_manager.free(request)
995
+ self.kv_cache_manager.free_block_hashes(request)
996
+ del self.requests[request.request_id]
997
+
998
+ def get_num_unfinished_requests(self) -> int:
999
+ return len(self.waiting) + len(self.running)
1000
+
1001
+ def has_finished_requests(self) -> bool:
1002
+ return len(self.finished_req_ids) > 0
1003
+
1004
+ def reset_prefix_cache(self) -> bool:
1005
+ return self.kv_cache_manager.reset_prefix_cache()
1006
+
1007
+ def make_stats(
1008
+ self,
1009
+ spec_decoding_stats: Optional[SpecDecodingStats] = None,
1010
+ ) -> Optional[SchedulerStats]:
1011
+ if not self.log_stats:
1012
+ return None
1013
+ prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
1014
+ assert prefix_cache_stats is not None
1015
+ return SchedulerStats(
1016
+ num_running_reqs=len(self.running),
1017
+ num_waiting_reqs=len(self.waiting),
1018
+ kv_cache_usage=self.kv_cache_manager.usage,
1019
+ prefix_cache_stats=prefix_cache_stats,
1020
+ spec_decoding_stats=spec_decoding_stats,
1021
+ num_corrupted_reqs=sum(req.is_output_corrupted
1022
+ for req in self.running),
1023
+ )
1024
+
1025
+ def make_spec_decoding_stats(
1026
+ self,
1027
+ spec_decoding_stats: Optional[SpecDecodingStats],
1028
+ num_draft_tokens: int,
1029
+ num_accepted_tokens: int,
1030
+ ) -> Optional[SpecDecodingStats]:
1031
+ if not self.log_stats:
1032
+ return None
1033
+ if spec_decoding_stats is None:
1034
+ spec_decoding_stats = SpecDecodingStats.new(self.num_spec_tokens)
1035
+ spec_decoding_stats.observe_draft(
1036
+ num_draft_tokens=num_draft_tokens,
1037
+ num_accepted_tokens=num_accepted_tokens)
1038
+ return spec_decoding_stats
1039
+
1040
+ def shutdown(self) -> None:
1041
+ if self.kv_event_publisher:
1042
+ self.kv_event_publisher.shutdown()
1043
+
1044
+ ########################################################################
1045
+ # KV Connector Related Methods
1046
+ ########################################################################
1047
+
1048
+ def get_kv_connector(self) -> Optional[KVConnectorBase_V1]:
1049
+ return self.connector
1050
+
1051
+ def _connector_finished(
1052
+ self, request: Request) -> tuple[bool, Optional[dict[str, Any]]]:
1053
+ """
1054
+ Invoke the KV connector request_finished() method if applicable.
1055
+
1056
+ Returns optional kv transfer parameters to be included with the
1057
+ request outputs.
1058
+ """
1059
+ if self.connector is None:
1060
+ return False, None
1061
+
1062
+ (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
1063
+ return self.connector.request_finished(request, block_ids)
1064
+
1065
+ def _update_waiting_for_remote_kv(self, request: Request) -> bool:
1066
+ """
1067
+ KV Connector: check if the request_id is finished_recving.
1068
+
1069
+ The finished_recving_kv_req_ids list is populated
1070
+ on the previous steps()'s update_from_output based
1071
+ on the worker side connector.
1072
+
1073
+ When the kv transfer is ready, we cache the blocks
1074
+ and the request state will be moved back to WAITING from
1075
+ WAITING_FOR_REMOTE_KV.
1076
+ """
1077
+ assert self.connector is not None
1078
+ if request.request_id not in self.finished_recving_kv_req_ids:
1079
+ return False
1080
+
1081
+ # Now that the blocks are ready, actually cache them.
1082
+ (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id)
1083
+ num_computed_tokens = len(block_ids) * self.block_size
1084
+ # Handle the case where num request tokens less then one block.
1085
+ num_computed_tokens = min(num_computed_tokens, request.num_tokens)
1086
+ if num_computed_tokens == request.num_tokens:
1087
+ num_computed_tokens -= 1
1088
+ # This will cache the blocks iff caching is enabled.
1089
+ self.kv_cache_manager.cache_blocks(request, num_computed_tokens)
1090
+
1091
+ # Update the request state for scheduling.
1092
+ request.num_computed_tokens = num_computed_tokens
1093
+
1094
+ # Return that we are ready.
1095
+ self.finished_recving_kv_req_ids.remove(request.request_id)
1096
+ return True
1097
+
1098
+ def _update_from_kv_xfer_finished(self,
1099
+ model_runner_output: ModelRunnerOutput):
1100
+ """
1101
+ KV Connector: update the scheduler state based on the output.
1102
+
1103
+ The Worker side connectors add finished_recving and
1104
+ finished_sending reqs to the output.
1105
+ * if finished_sending: free the blocks
1106
+ # if finished_recving: add to state so we can
1107
+ scheduler the request during the next step.
1108
+ """
1109
+ # KV Connector:: update recv and send status from last step.
1110
+ for req_id in (model_runner_output.finished_recving or ()):
1111
+ logger.debug("Finished recving KV transfer for request %s", req_id)
1112
+ self.finished_recving_kv_req_ids.add(req_id)
1113
+ for req_id in (model_runner_output.finished_sending or ()):
1114
+ logger.debug("Finished sending KV transfer for request %s", req_id)
1115
+ self._free_blocks(self.requests[req_id])