vllm-cpu 0.8.5.post2-cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vllm-cpu might be problematic.

Files changed (1103)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1536 -0
  4. vllm/_ipex_ops.py +241 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +38 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +31 -0
  16. vllm/assets/video.py +103 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +303 -0
  22. vllm/attention/backends/flash_attn.py +999 -0
  23. vllm/attention/backends/flashinfer.py +1092 -0
  24. vllm/attention/backends/flashmla.py +242 -0
  25. vllm/attention/backends/hpu_attn.py +301 -0
  26. vllm/attention/backends/ipex_attn.py +396 -0
  27. vllm/attention/backends/mla/__init__.py +0 -0
  28. vllm/attention/backends/mla/common.py +1444 -0
  29. vllm/attention/backends/pallas.py +346 -0
  30. vllm/attention/backends/placeholder_attn.py +399 -0
  31. vllm/attention/backends/rocm_aiter_mla.py +412 -0
  32. vllm/attention/backends/rocm_flash_attn.py +969 -0
  33. vllm/attention/backends/torch_sdpa.py +691 -0
  34. vllm/attention/backends/triton_mla.py +113 -0
  35. vllm/attention/backends/utils.py +609 -0
  36. vllm/attention/backends/xformers.py +798 -0
  37. vllm/attention/layer.py +443 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  41. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  42. vllm/attention/ops/blocksparse_attention/utils.py +244 -0
  43. vllm/attention/ops/chunked_prefill_paged_decode.py +366 -0
  44. vllm/attention/ops/flashmla.py +115 -0
  45. vllm/attention/ops/hpu_paged_attn.py +105 -0
  46. vllm/attention/ops/ipex_attn.py +193 -0
  47. vllm/attention/ops/merge_attn_states.py +42 -0
  48. vllm/attention/ops/nki_flash_attn.py +905 -0
  49. vllm/attention/ops/paged_attn.py +255 -0
  50. vllm/attention/ops/prefix_prefill.py +902 -0
  51. vllm/attention/ops/rocm_aiter_mla.py +42 -0
  52. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  53. vllm/attention/ops/triton_decode_attention.py +675 -0
  54. vllm/attention/ops/triton_flash_attention.py +1375 -0
  55. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  56. vllm/attention/selector.py +186 -0
  57. vllm/attention/utils/fa_utils.py +54 -0
  58. vllm/beam_search.py +82 -0
  59. vllm/benchmarks/__init__.py +0 -0
  60. vllm/benchmarks/datasets.py +831 -0
  61. vllm/benchmarks/endpoint_request_func.py +160 -0
  62. vllm/benchmarks/latency.py +181 -0
  63. vllm/benchmarks/serve.py +925 -0
  64. vllm/benchmarks/throughput.py +608 -0
  65. vllm/benchmarks/utils.py +69 -0
  66. vllm/collect_env.py +795 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/backends.py +715 -0
  69. vllm/compilation/compiler_interface.py +437 -0
  70. vllm/compilation/counter.py +33 -0
  71. vllm/compilation/decorators.py +249 -0
  72. vllm/compilation/fix_functionalization.py +182 -0
  73. vllm/compilation/fusion.py +617 -0
  74. vllm/compilation/fx_utils.py +60 -0
  75. vllm/compilation/inductor_pass.py +114 -0
  76. vllm/compilation/monitor.py +38 -0
  77. vllm/compilation/multi_output_match.py +108 -0
  78. vllm/compilation/noop_elimination.py +135 -0
  79. vllm/compilation/pass_manager.py +74 -0
  80. vllm/compilation/sequence_parallelism.py +266 -0
  81. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  82. vllm/compilation/vllm_inductor_pass.py +68 -0
  83. vllm/compilation/wrapper.py +129 -0
  84. vllm/config.py +4179 -0
  85. vllm/connections.py +170 -0
  86. vllm/core/__init__.py +0 -0
  87. vllm/core/block/__init__.py +0 -0
  88. vllm/core/block/block_table.py +398 -0
  89. vllm/core/block/common.py +370 -0
  90. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  91. vllm/core/block/interfaces.py +318 -0
  92. vllm/core/block/naive_block.py +465 -0
  93. vllm/core/block/prefix_caching_block.py +1134 -0
  94. vllm/core/block/utils.py +27 -0
  95. vllm/core/block_manager.py +520 -0
  96. vllm/core/evictor.py +156 -0
  97. vllm/core/interfaces.py +134 -0
  98. vllm/core/placeholder_block_space_manager.py +99 -0
  99. vllm/core/scheduler.py +2060 -0
  100. vllm/device_allocator/__init__.py +0 -0
  101. vllm/device_allocator/cumem.py +280 -0
  102. vllm/distributed/__init__.py +5 -0
  103. vllm/distributed/communication_op.py +40 -0
  104. vllm/distributed/device_communicators/__init__.py +0 -0
  105. vllm/distributed/device_communicators/base_device_communicator.py +151 -0
  106. vllm/distributed/device_communicators/cpu_communicator.py +139 -0
  107. vllm/distributed/device_communicators/cuda_communicator.py +131 -0
  108. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  109. vllm/distributed/device_communicators/custom_all_reduce.py +301 -0
  110. vllm/distributed/device_communicators/custom_all_reduce_utils.py +257 -0
  111. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  112. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  113. vllm/distributed/device_communicators/pynccl.py +217 -0
  114. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  115. vllm/distributed/device_communicators/shm_broadcast.py +557 -0
  116. vllm/distributed/device_communicators/tpu_communicator.py +93 -0
  117. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  118. vllm/distributed/kv_transfer/README.md +29 -0
  119. vllm/distributed/kv_transfer/__init__.py +11 -0
  120. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  121. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  122. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  123. vllm/distributed/kv_transfer/kv_connector/factory.py +107 -0
  124. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  125. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +201 -0
  126. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +90 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +8 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +209 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +131 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  132. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  133. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  134. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  135. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  136. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  137. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  138. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  139. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  140. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  141. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  142. vllm/distributed/parallel_state.py +1209 -0
  143. vllm/distributed/utils.py +366 -0
  144. vllm/engine/__init__.py +0 -0
  145. vllm/engine/arg_utils.py +1724 -0
  146. vllm/engine/async_llm_engine.py +1261 -0
  147. vllm/engine/async_timeout.py +191 -0
  148. vllm/engine/llm_engine.py +2150 -0
  149. vllm/engine/metrics.py +717 -0
  150. vllm/engine/metrics_types.py +96 -0
  151. vllm/engine/multiprocessing/__init__.py +183 -0
  152. vllm/engine/multiprocessing/client.py +745 -0
  153. vllm/engine/multiprocessing/engine.py +450 -0
  154. vllm/engine/output_processor/__init__.py +0 -0
  155. vllm/engine/output_processor/interfaces.py +74 -0
  156. vllm/engine/output_processor/multi_step.py +210 -0
  157. vllm/engine/output_processor/single_step.py +136 -0
  158. vllm/engine/output_processor/stop_checker.py +130 -0
  159. vllm/engine/output_processor/util.py +27 -0
  160. vllm/engine/protocol.py +302 -0
  161. vllm/entrypoints/__init__.py +0 -0
  162. vllm/entrypoints/api_server.py +177 -0
  163. vllm/entrypoints/chat_utils.py +1259 -0
  164. vllm/entrypoints/cli/__init__.py +0 -0
  165. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  166. vllm/entrypoints/cli/benchmark/base.py +38 -0
  167. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  168. vllm/entrypoints/cli/benchmark/main.py +53 -0
  169. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  170. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  171. vllm/entrypoints/cli/collect_env.py +35 -0
  172. vllm/entrypoints/cli/main.py +59 -0
  173. vllm/entrypoints/cli/openai.py +175 -0
  174. vllm/entrypoints/cli/serve.py +59 -0
  175. vllm/entrypoints/cli/types.py +24 -0
  176. vllm/entrypoints/launcher.py +146 -0
  177. vllm/entrypoints/llm.py +1450 -0
  178. vllm/entrypoints/logger.py +44 -0
  179. vllm/entrypoints/openai/__init__.py +0 -0
  180. vllm/entrypoints/openai/api_server.py +1130 -0
  181. vllm/entrypoints/openai/cli_args.py +296 -0
  182. vllm/entrypoints/openai/logits_processors.py +89 -0
  183. vllm/entrypoints/openai/protocol.py +1806 -0
  184. vllm/entrypoints/openai/run_batch.py +439 -0
  185. vllm/entrypoints/openai/serving_chat.py +1210 -0
  186. vllm/entrypoints/openai/serving_completion.py +557 -0
  187. vllm/entrypoints/openai/serving_embedding.py +245 -0
  188. vllm/entrypoints/openai/serving_engine.py +569 -0
  189. vllm/entrypoints/openai/serving_models.py +314 -0
  190. vllm/entrypoints/openai/serving_pooling.py +237 -0
  191. vllm/entrypoints/openai/serving_score.py +439 -0
  192. vllm/entrypoints/openai/serving_tokenization.py +147 -0
  193. vllm/entrypoints/openai/serving_transcription.py +421 -0
  194. vllm/entrypoints/openai/tool_parsers/__init__.py +19 -0
  195. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  196. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +254 -0
  197. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +232 -0
  198. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  199. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +211 -0
  200. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +303 -0
  201. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +262 -0
  202. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  203. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +110 -0
  204. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +292 -0
  205. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  206. vllm/entrypoints/score_utils.py +49 -0
  207. vllm/entrypoints/ssl.py +74 -0
  208. vllm/entrypoints/utils.py +136 -0
  209. vllm/env_override.py +34 -0
  210. vllm/envs.py +800 -0
  211. vllm/executor/__init__.py +0 -0
  212. vllm/executor/executor_base.py +400 -0
  213. vllm/executor/mp_distributed_executor.py +243 -0
  214. vllm/executor/msgspec_utils.py +29 -0
  215. vllm/executor/multiproc_worker_utils.py +312 -0
  216. vllm/executor/ray_distributed_executor.py +700 -0
  217. vllm/executor/ray_utils.py +400 -0
  218. vllm/executor/uniproc_executor.py +141 -0
  219. vllm/forward_context.py +159 -0
  220. vllm/inputs/__init__.py +37 -0
  221. vllm/inputs/data.py +248 -0
  222. vllm/inputs/parse.py +121 -0
  223. vllm/inputs/preprocess.py +745 -0
  224. vllm/inputs/registry.py +212 -0
  225. vllm/jsontree.py +79 -0
  226. vllm/logger.py +210 -0
  227. vllm/logging_utils/__init__.py +7 -0
  228. vllm/logging_utils/formatter.py +17 -0
  229. vllm/logits_process.py +121 -0
  230. vllm/lora/__init__.py +0 -0
  231. vllm/lora/fully_sharded_layers.py +335 -0
  232. vllm/lora/layers.py +1263 -0
  233. vllm/lora/lora.py +198 -0
  234. vllm/lora/models.py +802 -0
  235. vllm/lora/ops/__init__.py +0 -0
  236. vllm/lora/ops/torch_ops/__init__.py +15 -0
  237. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  238. vllm/lora/ops/triton_ops/__init__.py +11 -0
  239. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  240. vllm/lora/ops/triton_ops/lora_expand.py +293 -0
  241. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  242. vllm/lora/ops/triton_ops/lora_shrink.py +247 -0
  243. vllm/lora/ops/triton_ops/utils.py +121 -0
  244. vllm/lora/peft_helper.py +115 -0
  245. vllm/lora/punica_wrapper/__init__.py +9 -0
  246. vllm/lora/punica_wrapper/punica_base.py +483 -0
  247. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  248. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  249. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  250. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  251. vllm/lora/punica_wrapper/utils.py +161 -0
  252. vllm/lora/request.py +97 -0
  253. vllm/lora/resolver.py +83 -0
  254. vllm/lora/utils.py +237 -0
  255. vllm/lora/worker_manager.py +251 -0
  256. vllm/model_executor/__init__.py +15 -0
  257. vllm/model_executor/custom_op.py +153 -0
  258. vllm/model_executor/guided_decoding/__init__.py +180 -0
  259. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  260. vllm/model_executor/guided_decoding/guidance_logits_processors.py +85 -0
  261. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  262. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  263. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  264. vllm/model_executor/guided_decoding/outlines_logits_processors.py +271 -0
  265. vllm/model_executor/guided_decoding/reasoner/__init__.py +35 -0
  266. vllm/model_executor/guided_decoding/utils.py +241 -0
  267. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  268. vllm/model_executor/layers/__init__.py +0 -0
  269. vllm/model_executor/layers/activation.py +368 -0
  270. vllm/model_executor/layers/fused_moe/__init__.py +51 -0
  271. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  272. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  273. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  274. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  275. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  276. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  277. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  278. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  279. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  280. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  281. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  282. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  283. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  284. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  285. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  286. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  287. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  426. vllm/model_executor/layers/fused_moe/cutlass_moe.py +180 -0
  427. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +294 -0
  428. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +374 -0
  429. vllm/model_executor/layers/fused_moe/fused_moe.py +1539 -0
  430. vllm/model_executor/layers/fused_moe/layer.py +949 -0
  431. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  432. vllm/model_executor/layers/fused_moe/moe_pallas.py +64 -0
  433. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  434. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +416 -0
  435. vllm/model_executor/layers/fused_moe/utils.py +48 -0
  436. vllm/model_executor/layers/layernorm.py +277 -0
  437. vllm/model_executor/layers/lightning_attn.py +651 -0
  438. vllm/model_executor/layers/linear.py +1518 -0
  439. vllm/model_executor/layers/logits_processor.py +196 -0
  440. vllm/model_executor/layers/mamba/__init__.py +0 -0
  441. vllm/model_executor/layers/mamba/mamba2_metadata.py +109 -0
  442. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  443. vllm/model_executor/layers/mamba/mamba_mixer2.py +538 -0
  444. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  445. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  446. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +415 -0
  447. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  448. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  449. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  450. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  451. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  452. vllm/model_executor/layers/pooler.py +336 -0
  453. vllm/model_executor/layers/quantization/__init__.py +153 -0
  454. vllm/model_executor/layers/quantization/aqlm.py +374 -0
  455. vllm/model_executor/layers/quantization/awq.py +184 -0
  456. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  457. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  458. vllm/model_executor/layers/quantization/base_config.py +145 -0
  459. vllm/model_executor/layers/quantization/bitblas.py +459 -0
  460. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  461. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  462. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +624 -0
  463. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1100 -0
  464. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +20 -0
  465. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  466. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  467. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  468. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +119 -0
  469. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  470. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  471. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  472. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  473. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +213 -0
  474. vllm/model_executor/layers/quantization/deepspeedfp.py +193 -0
  475. vllm/model_executor/layers/quantization/experts_int8.py +194 -0
  476. vllm/model_executor/layers/quantization/fbgemm_fp8.py +168 -0
  477. vllm/model_executor/layers/quantization/fp8.py +832 -0
  478. vllm/model_executor/layers/quantization/gguf.py +408 -0
  479. vllm/model_executor/layers/quantization/gptq.py +276 -0
  480. vllm/model_executor/layers/quantization/gptq_bitblas.py +438 -0
  481. vllm/model_executor/layers/quantization/gptq_marlin.py +643 -0
  482. vllm/model_executor/layers/quantization/gptq_marlin_24.py +295 -0
  483. vllm/model_executor/layers/quantization/hqq_marlin.py +328 -0
  484. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  485. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  486. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  487. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  488. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  489. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  490. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  491. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  492. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +132 -0
  493. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  494. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  495. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  496. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  497. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  498. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  499. vllm/model_executor/layers/quantization/kv_cache.py +137 -0
  500. vllm/model_executor/layers/quantization/marlin.py +259 -0
  501. vllm/model_executor/layers/quantization/modelopt.py +410 -0
  502. vllm/model_executor/layers/quantization/moe_wna16.py +447 -0
  503. vllm/model_executor/layers/quantization/neuron_quant.py +67 -0
  504. vllm/model_executor/layers/quantization/ptpc_fp8.py +125 -0
  505. vllm/model_executor/layers/quantization/qqq.py +273 -0
  506. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  507. vllm/model_executor/layers/quantization/quark/quark.py +385 -0
  508. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  509. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +7 -0
  510. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  511. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +142 -0
  512. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  513. vllm/model_executor/layers/quantization/quark/utils.py +102 -0
  514. vllm/model_executor/layers/quantization/schema.py +85 -0
  515. vllm/model_executor/layers/quantization/torchao.py +127 -0
  516. vllm/model_executor/layers/quantization/tpu_int8.py +119 -0
  517. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  518. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  519. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +198 -0
  520. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  521. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  522. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  523. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  524. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  525. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  526. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  527. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  528. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  529. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  530. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  531. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  532. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  533. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  534. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  535. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  536. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  537. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  538. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  539. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  540. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  541. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  542. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  543. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  544. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  545. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  546. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  547. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  548. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  549. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  550. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  551. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/fp8_utils.py +523 -0
  723. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  724. vllm/model_executor/layers/quantization/utils/int8_utils.py +459 -0
  725. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  726. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  727. vllm/model_executor/layers/quantization/utils/marlin_utils.py +413 -0
  728. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +110 -0
  729. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  730. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  731. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +127 -0
  732. vllm/model_executor/layers/quantization/utils/quant_utils.py +571 -0
  733. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  734. vllm/model_executor/layers/rejection_sampler.py +400 -0
  735. vllm/model_executor/layers/resampler.py +269 -0
  736. vllm/model_executor/layers/rotary_embedding.py +1598 -0
  737. vllm/model_executor/layers/sampler.py +1221 -0
  738. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  739. vllm/model_executor/layers/typical_acceptance_sampler.py +172 -0
  740. vllm/model_executor/layers/utils.py +99 -0
  741. vllm/model_executor/layers/vocab_parallel_embedding.py +485 -0
  742. vllm/model_executor/model_loader/__init__.py +20 -0
  743. vllm/model_executor/model_loader/loader.py +1542 -0
  744. vllm/model_executor/model_loader/neuron.py +243 -0
  745. vllm/model_executor/model_loader/tensorizer.py +468 -0
  746. vllm/model_executor/model_loader/utils.py +171 -0
  747. vllm/model_executor/model_loader/weight_utils.py +749 -0
  748. vllm/model_executor/models/__init__.py +27 -0
  749. vllm/model_executor/models/adapters.py +247 -0
  750. vllm/model_executor/models/arctic.py +559 -0
  751. vllm/model_executor/models/aria.py +656 -0
  752. vllm/model_executor/models/aya_vision.py +461 -0
  753. vllm/model_executor/models/baichuan.py +469 -0
  754. vllm/model_executor/models/bamba.py +542 -0
  755. vllm/model_executor/models/bart.py +936 -0
  756. vllm/model_executor/models/bert.py +725 -0
  757. vllm/model_executor/models/blip.py +337 -0
  758. vllm/model_executor/models/blip2.py +717 -0
  759. vllm/model_executor/models/bloom.py +358 -0
  760. vllm/model_executor/models/chameleon.py +1135 -0
  761. vllm/model_executor/models/chatglm.py +476 -0
  762. vllm/model_executor/models/clip.py +410 -0
  763. vllm/model_executor/models/commandr.py +466 -0
  764. vllm/model_executor/models/constant_size_cache.py +136 -0
  765. vllm/model_executor/models/dbrx.py +469 -0
  766. vllm/model_executor/models/deepseek.py +484 -0
  767. vllm/model_executor/models/deepseek_mtp.py +266 -0
  768. vllm/model_executor/models/deepseek_v2.py +830 -0
  769. vllm/model_executor/models/deepseek_vl2.py +647 -0
  770. vllm/model_executor/models/eagle.py +247 -0
  771. vllm/model_executor/models/exaone.py +548 -0
  772. vllm/model_executor/models/fairseq2_llama.py +153 -0
  773. vllm/model_executor/models/falcon.py +508 -0
  774. vllm/model_executor/models/florence2.py +1102 -0
  775. vllm/model_executor/models/fuyu.py +388 -0
  776. vllm/model_executor/models/gemma.py +423 -0
  777. vllm/model_executor/models/gemma2.py +423 -0
  778. vllm/model_executor/models/gemma3.py +531 -0
  779. vllm/model_executor/models/gemma3_mm.py +716 -0
  780. vllm/model_executor/models/glm.py +22 -0
  781. vllm/model_executor/models/glm4.py +303 -0
  782. vllm/model_executor/models/glm4v.py +647 -0
  783. vllm/model_executor/models/gpt2.py +313 -0
  784. vllm/model_executor/models/gpt_bigcode.py +336 -0
  785. vllm/model_executor/models/gpt_j.py +337 -0
  786. vllm/model_executor/models/gpt_neox.py +330 -0
  787. vllm/model_executor/models/granite.py +494 -0
  788. vllm/model_executor/models/granite_speech.py +777 -0
  789. vllm/model_executor/models/granitemoe.py +435 -0
  790. vllm/model_executor/models/granitemoeshared.py +339 -0
  791. vllm/model_executor/models/gritlm.py +245 -0
  792. vllm/model_executor/models/grok1.py +560 -0
  793. vllm/model_executor/models/h2ovl.py +542 -0
  794. vllm/model_executor/models/idefics2_vision_model.py +387 -0
  795. vllm/model_executor/models/idefics3.py +767 -0
  796. vllm/model_executor/models/interfaces.py +569 -0
  797. vllm/model_executor/models/interfaces_base.py +163 -0
  798. vllm/model_executor/models/intern_vit.py +476 -0
  799. vllm/model_executor/models/internlm2.py +453 -0
  800. vllm/model_executor/models/internlm2_ve.py +146 -0
  801. vllm/model_executor/models/internvl.py +945 -0
  802. vllm/model_executor/models/jais.py +371 -0
  803. vllm/model_executor/models/jamba.py +590 -0
  804. vllm/model_executor/models/kimi_vl.py +577 -0
  805. vllm/model_executor/models/llama.py +619 -0
  806. vllm/model_executor/models/llama4.py +530 -0
  807. vllm/model_executor/models/llama_eagle.py +152 -0
  808. vllm/model_executor/models/llama_eagle3.py +232 -0
  809. vllm/model_executor/models/llava.py +869 -0
  810. vllm/model_executor/models/llava_next.py +582 -0
  811. vllm/model_executor/models/llava_next_video.py +470 -0
  812. vllm/model_executor/models/llava_onevision.py +954 -0
  813. vllm/model_executor/models/mamba.py +271 -0
  814. vllm/model_executor/models/mamba2.py +302 -0
  815. vllm/model_executor/models/mamba_cache.py +76 -0
  816. vllm/model_executor/models/medusa.py +210 -0
  817. vllm/model_executor/models/minicpm.py +592 -0
  818. vllm/model_executor/models/minicpm3.py +229 -0
  819. vllm/model_executor/models/minicpmo.py +725 -0
  820. vllm/model_executor/models/minicpmv.py +1287 -0
  821. vllm/model_executor/models/minimax_cache.py +35 -0
  822. vllm/model_executor/models/minimax_text_01.py +1261 -0
  823. vllm/model_executor/models/mistral3.py +598 -0
  824. vllm/model_executor/models/mixtral.py +485 -0
  825. vllm/model_executor/models/mixtral_quant.py +447 -0
  826. vllm/model_executor/models/mllama.py +1623 -0
  827. vllm/model_executor/models/mllama4.py +838 -0
  828. vllm/model_executor/models/mlp_speculator.py +205 -0
  829. vllm/model_executor/models/modernbert.py +325 -0
  830. vllm/model_executor/models/module_mapping.py +71 -0
  831. vllm/model_executor/models/molmo.py +1567 -0
  832. vllm/model_executor/models/moonvit.py +628 -0
  833. vllm/model_executor/models/mpt.py +329 -0
  834. vllm/model_executor/models/nemotron.py +506 -0
  835. vllm/model_executor/models/nemotron_nas.py +446 -0
  836. vllm/model_executor/models/nvlm_d.py +212 -0
  837. vllm/model_executor/models/olmo.py +390 -0
  838. vllm/model_executor/models/olmo2.py +412 -0
  839. vllm/model_executor/models/olmoe.py +449 -0
  840. vllm/model_executor/models/opt.py +410 -0
  841. vllm/model_executor/models/orion.py +356 -0
  842. vllm/model_executor/models/paligemma.py +397 -0
  843. vllm/model_executor/models/persimmon.py +342 -0
  844. vllm/model_executor/models/phi.py +354 -0
  845. vllm/model_executor/models/phi3.py +18 -0
  846. vllm/model_executor/models/phi3_small.py +463 -0
  847. vllm/model_executor/models/phi3v.py +722 -0
  848. vllm/model_executor/models/phi4mm.py +1263 -0
  849. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  850. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  851. vllm/model_executor/models/phimoe.py +666 -0
  852. vllm/model_executor/models/pixtral.py +1281 -0
  853. vllm/model_executor/models/plamo2.py +736 -0
  854. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  855. vllm/model_executor/models/qwen.py +360 -0
  856. vllm/model_executor/models/qwen2.py +552 -0
  857. vllm/model_executor/models/qwen2_5_omni_thinker.py +901 -0
  858. vllm/model_executor/models/qwen2_5_vl.py +1136 -0
  859. vllm/model_executor/models/qwen2_audio.py +402 -0
  860. vllm/model_executor/models/qwen2_moe.py +531 -0
  861. vllm/model_executor/models/qwen2_rm.py +130 -0
  862. vllm/model_executor/models/qwen2_vl.py +1409 -0
  863. vllm/model_executor/models/qwen3.py +319 -0
  864. vllm/model_executor/models/qwen3_moe.py +528 -0
  865. vllm/model_executor/models/qwen_vl.py +784 -0
  866. vllm/model_executor/models/registry.py +611 -0
  867. vllm/model_executor/models/roberta.py +332 -0
  868. vllm/model_executor/models/siglip.py +522 -0
  869. vllm/model_executor/models/skyworkr1v.py +949 -0
  870. vllm/model_executor/models/smolvlm.py +51 -0
  871. vllm/model_executor/models/solar.py +504 -0
  872. vllm/model_executor/models/stablelm.py +349 -0
  873. vllm/model_executor/models/starcoder2.py +355 -0
  874. vllm/model_executor/models/telechat2.py +139 -0
  875. vllm/model_executor/models/teleflm.py +78 -0
  876. vllm/model_executor/models/transformers.py +442 -0
  877. vllm/model_executor/models/ultravox.py +655 -0
  878. vllm/model_executor/models/utils.py +714 -0
  879. vllm/model_executor/models/vision.py +149 -0
  880. vllm/model_executor/models/whisper.py +746 -0
  881. vllm/model_executor/models/zamba2.py +1008 -0
  882. vllm/model_executor/parameter.py +458 -0
  883. vllm/model_executor/pooling_metadata.py +71 -0
  884. vllm/model_executor/sampling_metadata.py +596 -0
  885. vllm/model_executor/utils.py +53 -0
  886. vllm/multimodal/__init__.py +31 -0
  887. vllm/multimodal/audio.py +105 -0
  888. vllm/multimodal/base.py +218 -0
  889. vllm/multimodal/hasher.py +103 -0
  890. vllm/multimodal/image.py +77 -0
  891. vllm/multimodal/inputs.py +843 -0
  892. vllm/multimodal/parse.py +454 -0
  893. vllm/multimodal/processing.py +1760 -0
  894. vllm/multimodal/profiling.py +274 -0
  895. vllm/multimodal/registry.py +321 -0
  896. vllm/multimodal/utils.py +386 -0
  897. vllm/multimodal/video.py +166 -0
  898. vllm/outputs.py +521 -0
  899. vllm/platforms/__init__.py +286 -0
  900. vllm/platforms/cpu.py +182 -0
  901. vllm/platforms/cuda.py +463 -0
  902. vllm/platforms/hpu.py +94 -0
  903. vllm/platforms/interface.py +427 -0
  904. vllm/platforms/neuron.py +69 -0
  905. vllm/platforms/rocm.py +346 -0
  906. vllm/platforms/tpu.py +174 -0
  907. vllm/platforms/xpu.py +142 -0
  908. vllm/plugins/__init__.py +82 -0
  909. vllm/pooling_params.py +53 -0
  910. vllm/profiler/__init__.py +7 -0
  911. vllm/profiler/layerwise_profile.py +374 -0
  912. vllm/profiler/utils.py +147 -0
  913. vllm/prompt_adapter/__init__.py +0 -0
  914. vllm/prompt_adapter/layers.py +82 -0
  915. vllm/prompt_adapter/models.py +357 -0
  916. vllm/prompt_adapter/request.py +36 -0
  917. vllm/prompt_adapter/utils.py +97 -0
  918. vllm/prompt_adapter/worker_manager.py +178 -0
  919. vllm/py.typed +2 -0
  920. vllm/reasoning/__init__.py +12 -0
  921. vllm/reasoning/abs_reasoning_parsers.py +189 -0
  922. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  923. vllm/reasoning/granite_reasoning_parser.py +362 -0
  924. vllm/sampling_params.py +598 -0
  925. vllm/scalar_type.py +335 -0
  926. vllm/scripts.py +14 -0
  927. vllm/sequence.py +1486 -0
  928. vllm/spec_decode/__init__.py +0 -0
  929. vllm/spec_decode/batch_expansion.py +505 -0
  930. vllm/spec_decode/draft_model_runner.py +335 -0
  931. vllm/spec_decode/interfaces.py +98 -0
  932. vllm/spec_decode/medusa_worker.py +137 -0
  933. vllm/spec_decode/metrics.py +212 -0
  934. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  935. vllm/spec_decode/mqa_scorer.py +159 -0
  936. vllm/spec_decode/multi_step_worker.py +416 -0
  937. vllm/spec_decode/ngram_worker.py +195 -0
  938. vllm/spec_decode/proposer_worker_base.py +58 -0
  939. vllm/spec_decode/smaller_tp_proposer_worker.py +194 -0
  940. vllm/spec_decode/spec_decode_worker.py +1324 -0
  941. vllm/spec_decode/target_model_runner.py +44 -0
  942. vllm/spec_decode/top1_proposer.py +274 -0
  943. vllm/spec_decode/util.py +276 -0
  944. vllm/test_utils.py +129 -0
  945. vllm/third_party/__init__.py +0 -0
  946. vllm/third_party/pynvml.py +6139 -0
  947. vllm/tracing.py +130 -0
  948. vllm/transformers_utils/__init__.py +19 -0
  949. vllm/transformers_utils/config.py +813 -0
  950. vllm/transformers_utils/configs/__init__.py +52 -0
  951. vllm/transformers_utils/configs/arctic.py +206 -0
  952. vllm/transformers_utils/configs/chatglm.py +71 -0
  953. vllm/transformers_utils/configs/cohere2.py +194 -0
  954. vllm/transformers_utils/configs/dbrx.py +280 -0
  955. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  956. vllm/transformers_utils/configs/eagle.py +65 -0
  957. vllm/transformers_utils/configs/exaone.py +191 -0
  958. vllm/transformers_utils/configs/falcon.py +89 -0
  959. vllm/transformers_utils/configs/h2ovl.py +15 -0
  960. vllm/transformers_utils/configs/internvl.py +53 -0
  961. vllm/transformers_utils/configs/jais.py +237 -0
  962. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  963. vllm/transformers_utils/configs/medusa.py +62 -0
  964. vllm/transformers_utils/configs/mllama.py +30 -0
  965. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  966. vllm/transformers_utils/configs/moonvit.py +32 -0
  967. vllm/transformers_utils/configs/mpt.py +179 -0
  968. vllm/transformers_utils/configs/nemotron.py +204 -0
  969. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  970. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  971. vllm/transformers_utils/configs/solar.py +246 -0
  972. vllm/transformers_utils/configs/telechat2.py +63 -0
  973. vllm/transformers_utils/configs/ultravox.py +107 -0
  974. vllm/transformers_utils/detokenizer.py +167 -0
  975. vllm/transformers_utils/detokenizer_utils.py +188 -0
  976. vllm/transformers_utils/processor.py +210 -0
  977. vllm/transformers_utils/processors/__init__.py +6 -0
  978. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  979. vllm/transformers_utils/s3_utils.py +161 -0
  980. vllm/transformers_utils/tokenizer.py +291 -0
  981. vllm/transformers_utils/tokenizer_base.py +146 -0
  982. vllm/transformers_utils/tokenizer_group.py +110 -0
  983. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  984. vllm/transformers_utils/tokenizers/mistral.py +483 -0
  985. vllm/transformers_utils/utils.py +98 -0
  986. vllm/triton_utils/__init__.py +5 -0
  987. vllm/triton_utils/importing.py +53 -0
  988. vllm/usage/__init__.py +0 -0
  989. vllm/usage/usage_lib.py +255 -0
  990. vllm/utils.py +2692 -0
  991. vllm/v1/__init__.py +0 -0
  992. vllm/v1/attention/__init__.py +0 -0
  993. vllm/v1/attention/backends/__init__.py +0 -0
  994. vllm/v1/attention/backends/flash_attn.py +783 -0
  995. vllm/v1/attention/backends/flashinfer.py +638 -0
  996. vllm/v1/attention/backends/mla/__init__.py +0 -0
  997. vllm/v1/attention/backends/mla/common.py +974 -0
  998. vllm/v1/attention/backends/mla/flashmla.py +149 -0
  999. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1000. vllm/v1/attention/backends/pallas.py +221 -0
  1001. vllm/v1/attention/backends/triton_attn.py +198 -0
  1002. vllm/v1/core/__init__.py +0 -0
  1003. vllm/v1/core/block_pool.py +281 -0
  1004. vllm/v1/core/encoder_cache_manager.py +149 -0
  1005. vllm/v1/core/kv_cache_manager.py +385 -0
  1006. vllm/v1/core/kv_cache_utils.py +744 -0
  1007. vllm/v1/core/sched/__init__.py +0 -0
  1008. vllm/v1/core/sched/interface.py +134 -0
  1009. vllm/v1/core/sched/output.py +126 -0
  1010. vllm/v1/core/sched/scheduler.py +838 -0
  1011. vllm/v1/core/sched/utils.py +22 -0
  1012. vllm/v1/core/specialized_manager.py +161 -0
  1013. vllm/v1/engine/__init__.py +166 -0
  1014. vllm/v1/engine/async_llm.py +532 -0
  1015. vllm/v1/engine/core.py +701 -0
  1016. vllm/v1/engine/core_client.py +942 -0
  1017. vllm/v1/engine/detokenizer.py +260 -0
  1018. vllm/v1/engine/exceptions.py +16 -0
  1019. vllm/v1/engine/llm_engine.py +285 -0
  1020. vllm/v1/engine/logprobs.py +198 -0
  1021. vllm/v1/engine/mm_input_cache.py +82 -0
  1022. vllm/v1/engine/output_processor.py +420 -0
  1023. vllm/v1/engine/parallel_sampling.py +132 -0
  1024. vllm/v1/engine/processor.py +387 -0
  1025. vllm/v1/executor/__init__.py +0 -0
  1026. vllm/v1/executor/abstract.py +112 -0
  1027. vllm/v1/executor/multiproc_executor.py +480 -0
  1028. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1029. vllm/v1/kv_cache_interface.py +166 -0
  1030. vllm/v1/metrics/__init__.py +0 -0
  1031. vllm/v1/metrics/loggers.py +498 -0
  1032. vllm/v1/metrics/stats.py +238 -0
  1033. vllm/v1/outputs.py +111 -0
  1034. vllm/v1/request.py +178 -0
  1035. vllm/v1/sample/__init__.py +0 -0
  1036. vllm/v1/sample/metadata.py +43 -0
  1037. vllm/v1/sample/ops/__init__.py +0 -0
  1038. vllm/v1/sample/ops/bad_words.py +38 -0
  1039. vllm/v1/sample/ops/penalties.py +58 -0
  1040. vllm/v1/sample/ops/topk_topp_sampler.py +315 -0
  1041. vllm/v1/sample/rejection_sampler.py +631 -0
  1042. vllm/v1/sample/sampler.py +270 -0
  1043. vllm/v1/sample/tpu/__init__.py +0 -0
  1044. vllm/v1/sample/tpu/metadata.py +118 -0
  1045. vllm/v1/sample/tpu/sampler.py +154 -0
  1046. vllm/v1/serial_utils.py +274 -0
  1047. vllm/v1/spec_decode/__init__.py +0 -0
  1048. vllm/v1/spec_decode/eagle.py +318 -0
  1049. vllm/v1/spec_decode/metadata.py +61 -0
  1050. vllm/v1/spec_decode/metrics.py +164 -0
  1051. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1052. vllm/v1/spec_decode/utils.py +18 -0
  1053. vllm/v1/stats/__init__.py +0 -0
  1054. vllm/v1/stats/common.py +453 -0
  1055. vllm/v1/structured_output/__init__.py +113 -0
  1056. vllm/v1/structured_output/backend_guidance.py +215 -0
  1057. vllm/v1/structured_output/backend_types.py +96 -0
  1058. vllm/v1/structured_output/backend_xgrammar.py +299 -0
  1059. vllm/v1/structured_output/request.py +84 -0
  1060. vllm/v1/structured_output/utils.py +174 -0
  1061. vllm/v1/utils.py +249 -0
  1062. vllm/v1/worker/__init__.py +0 -0
  1063. vllm/v1/worker/block_table.py +87 -0
  1064. vllm/v1/worker/gpu_input_batch.py +677 -0
  1065. vllm/v1/worker/gpu_model_runner.py +1776 -0
  1066. vllm/v1/worker/gpu_worker.py +349 -0
  1067. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1068. vllm/v1/worker/tpu_model_runner.py +1419 -0
  1069. vllm/v1/worker/tpu_worker.py +260 -0
  1070. vllm/v1/worker/utils.py +74 -0
  1071. vllm/v1/worker/worker_base.py +64 -0
  1072. vllm/version.py +40 -0
  1073. vllm/vllm_flash_attn/.gitkeep +0 -0
  1074. vllm/worker/__init__.py +0 -0
  1075. vllm/worker/cache_engine.py +144 -0
  1076. vllm/worker/cpu_enc_dec_model_runner.py +323 -0
  1077. vllm/worker/cpu_model_runner.py +668 -0
  1078. vllm/worker/cpu_pooling_model_runner.py +122 -0
  1079. vllm/worker/cpu_worker.py +400 -0
  1080. vllm/worker/enc_dec_model_runner.py +542 -0
  1081. vllm/worker/hpu_model_runner.py +2221 -0
  1082. vllm/worker/hpu_worker.py +483 -0
  1083. vllm/worker/model_runner.py +2056 -0
  1084. vllm/worker/model_runner_base.py +281 -0
  1085. vllm/worker/multi_step_hpu_worker.py +122 -0
  1086. vllm/worker/multi_step_model_runner.py +908 -0
  1087. vllm/worker/multi_step_tpu_worker.py +107 -0
  1088. vllm/worker/multi_step_worker.py +196 -0
  1089. vllm/worker/neuron_model_runner.py +336 -0
  1090. vllm/worker/neuron_worker.py +138 -0
  1091. vllm/worker/pooling_model_runner.py +200 -0
  1092. vllm/worker/tpu_model_runner.py +908 -0
  1093. vllm/worker/tpu_worker.py +332 -0
  1094. vllm/worker/utils.py +52 -0
  1095. vllm/worker/worker.py +570 -0
  1096. vllm/worker/worker_base.py +644 -0
  1097. vllm/worker/xpu_model_runner.py +603 -0
  1098. vllm/worker/xpu_worker.py +185 -0
  1099. vllm_cpu-0.8.5.post2.dist-info/METADATA +309 -0
  1100. vllm_cpu-0.8.5.post2.dist-info/RECORD +1103 -0
  1101. vllm_cpu-0.8.5.post2.dist-info/WHEEL +5 -0
  1102. vllm_cpu-0.8.5.post2.dist-info/entry_points.txt +2 -0
  1103. vllm_cpu-0.8.5.post2.dist-info/top_level.txt +1 -0
vllm/core/scheduler.py ADDED
@@ -0,0 +1,2060 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ import enum
+ import os
+ import random
+ import time
+ from collections import deque
+ from dataclasses import dataclass, field
+ from typing import Callable, Deque, Dict, Iterable, List, Optional
+ from typing import Sequence as GenericSequence
+ from typing import Set, Tuple, Union
+
+ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
+ from vllm.logger import init_logger
+ from vllm.lora.request import LoRARequest
+ from vllm.prompt_adapter.request import PromptAdapterRequest
+ from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
+                            SequenceGroupBase, SequenceGroupMetadata,
+                            SequenceGroupMetadataDelta, SequenceStage,
+                            SequenceStatus)
+ from vllm.utils import Device, PyObjectCache
+
+ logger = init_logger(__name__)
+
+ # Test-only. If configured, decode is preempted with
+ # ARTIFICIAL_PREEMPTION_PROB% probability.
+ ENABLE_ARTIFICIAL_PREEMPT = bool(
+     os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False))  # noqa
+ ARTIFICIAL_PREEMPTION_PROB = 0.5
+ ARTIFICIAL_PREEMPTION_MAX_CNT = 500
+
+
+ class PreemptionMode(enum.Enum):
+     """Preemption modes.
+
+     1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
+     and swap them back in when the sequences are resumed.
+     2. Recomputation: Discard the blocks of the preempted sequences and
+     recompute them when the sequences are resumed, treating the sequences as
+     new prompts.
+     """
+
+     SWAP = enum.auto()
+     RECOMPUTE = enum.auto()
+
+
+ @dataclass
+ class SchedulingBudget:
+     """The available slots for scheduling.
+
+     TODO(sang): Right now, the budget is request_id-aware meaning it can ignore
+     budget update from the same request_id. It is because in normal scheduling
+     path, we update RUNNING num_seqs ahead of time, meaning it could be
+     updated more than once when scheduling RUNNING requests. Since this won't
+     happen if we only have chunked prefill scheduling, we can remove this
+     feature from the API when chunked prefill is enabled by default.
+     """
+
+     token_budget: int
+     max_num_seqs: int
+     _request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
+     _request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
+     # Number of cached tokens in the batch.
+     _num_cached_tokens: int = 0
+     # Number of actual non-cached tokens in the batch.
+     _num_batched_tokens: int = 0
+     _num_curr_seqs: int = 0
+
+     def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
+         # We allow num_new_tokens to be 0 when the entire sequence has
+         # been cached.
+         assert num_new_tokens >= 0
+         assert num_new_seqs != 0
+         return (self.num_batched_tokens + num_new_tokens <= self.token_budget
+                 and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)
+
+     def remaining_token_budget(self):
+         return self.token_budget - self.num_batched_tokens
+
+     def add_num_batched_tokens(self,
+                                req_id: str,
+                                num_batched_tokens: int,
+                                num_cached_tokens: int = 0):
+         if req_id in self._request_ids_num_batched_tokens:
+             return
+         assert num_cached_tokens >= 0
+         assert num_batched_tokens >= 0
+
+         self._request_ids_num_batched_tokens.add(req_id)
+         self._num_batched_tokens += num_batched_tokens
+         self._num_cached_tokens += num_cached_tokens
+
+     def subtract_num_batched_tokens(self, req_id: str,
+                                     num_batched_tokens: int):
+         if req_id in self._request_ids_num_batched_tokens:
+             self._request_ids_num_batched_tokens.remove(req_id)
+             self._num_batched_tokens -= num_batched_tokens
+
+     def add_num_seqs(self, req_id: str, num_curr_seqs: int):
+         if req_id in self._request_ids_num_curr_seqs:
+             return
+
+         self._request_ids_num_curr_seqs.add(req_id)
+         self._num_curr_seqs += num_curr_seqs
+
+     def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
+         if req_id in self._request_ids_num_curr_seqs:
+             self._request_ids_num_curr_seqs.remove(req_id)
+             self._num_curr_seqs -= num_curr_seqs
+
+     @property
+     def num_batched_tokens(self):
+         return self._num_batched_tokens
+
+     @property
+     def num_curr_seqs(self):
+         return self._num_curr_seqs
+
+     @property
+     def num_cached_tokens(self):
+         return self._num_cached_tokens
+
+
+ @dataclass
+ class ScheduledSequenceGroup:
+     # A sequence group that's scheduled.
+     seq_group: SequenceGroup
+     # The total chunk size (number of tokens) to process for next iteration.
+     # 1 for decoding. Same as prompt tokens for prefill, but if prefill is
+     # chunked, it can be smaller than that.
+     token_chunk_size: int
+
+
+ @dataclass
+ class SchedulerOutputs:
+     """The scheduling decision made from a scheduler."""
+
+     # Scheduled sequence groups.
+     scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup]
+     # Number of prefill groups scheduled.
+     num_prefill_groups: int
+     # Total number of batched tokens.
+     num_batched_tokens: int
+     # Blocks to swap in. List of CPU -> GPU block number.
+     blocks_to_swap_in: List[Tuple[int, int]]
+     # Blocks to swap out. List of GPU -> CPU block number.
+     blocks_to_swap_out: List[Tuple[int, int]]
+     # Blocks to copy. Source to dest block.
+     blocks_to_copy: List[Tuple[int, int]]
+     # Sequence groups that are going to be ignored.
+     ignored_seq_groups: List[SequenceGroup]
+     # The number of slots for lookahead decoding.
+     num_lookahead_slots: int
+     # The number of requests in the running queue
+     running_queue_size: int
+     preempted: int
+
+     def __post_init__(self):
+         # Swap in and swap out should never happen at the same time.
+         assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)
+
+         self.num_loras: int = len(self.lora_requests)
+         if self.num_loras > 0:
+             self._sort_by_lora_ids()
+
+         self.num_prompt_adapters: int = len(self.prompt_adapter_requests)
+
+     def is_empty(self) -> bool:
+         # NOTE: We do not consider the ignored sequence groups.
+         return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
+                 and not self.blocks_to_swap_out and not self.blocks_to_copy)
+
+     def _sort_by_lora_ids(self):
+         assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups)
+
+         def key_fn(group: ScheduledSequenceGroup):
+             key = (group.seq_group.lora_int_id, group.seq_group.request_id)
+             if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups):
+                 # Sort sequence groups so that all prefills come before all
+                 # decodes as required by chunked prefill.
+                 return (not group.seq_group.is_prefill(), *key)
+             return key
+
+         self.scheduled_seq_groups = sorted(self.scheduled_seq_groups,
+                                            key=key_fn)
+
+     @property
+     def lora_requests(self) -> Set[LoRARequest]:
+         return {
+             g.seq_group.lora_request
+             for g in self.scheduled_seq_groups
+             if g.seq_group.lora_request is not None
+         }
+
+     @property
+     def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
+         return {
+             g.seq_group.prompt_adapter_request
+             for g in self.scheduled_seq_groups
+             if g.seq_group.prompt_adapter_request is not None
+         }
+
+
+ @dataclass
+ class SchedulerRunningOutputs:
+     """The requests that are scheduled from a running queue.
+
+     Could contain prefill (prefill that's chunked) or decodes. If there's not
+     enough memory, it can be preempted (for recompute) or swapped out.
+     """
+
+     # Selected sequences that are running and in a decoding phase.
+     decode_seq_groups: List[ScheduledSequenceGroup]
+     # Selected sequences that are running and in a prefill phase.
+     # I.e., it means the prefill has been chunked.
+     prefill_seq_groups: List[ScheduledSequenceGroup]
+     # The preempted sequences.
+     preempted: List[SequenceGroup]
+     # Sequences that are swapped out.
+     swapped_out: List[SequenceGroup]
+     # The blocks to swap out.
+     blocks_to_swap_out: List[Tuple[int, int]]
+     # The blocks to copy.
+     blocks_to_copy: List[Tuple[int, int]]
+     # The number of slots for lookahead decoding.
+     num_lookahead_slots: int
+
+     # Optimization for fast-access to seq_group lists
+     decode_seq_groups_list: List[SequenceGroup]
+     prefill_seq_groups_list: List[SequenceGroup]
+
+     @classmethod
+     def create_empty(cls) -> "SchedulerRunningOutputs":
+         return SchedulerRunningOutputs(
+             decode_seq_groups=[],
+             prefill_seq_groups=[],
+             preempted=[],
+             swapped_out=[],
+             blocks_to_swap_out=[],
+             blocks_to_copy=[],
+             num_lookahead_slots=0,
+             decode_seq_groups_list=[],
+             prefill_seq_groups_list=[],
+         )
+
+
+ @dataclass
+ class SchedulerSwappedInOutputs:
+     """The requests that are scheduled from a swap queue.
+
+     Could contain prefill (prefill that's chunked) or decodes.
+     """
+
+     # Selected sequences that are going to be swapped in and is in a
+     # decoding phase.
+     decode_seq_groups: List[ScheduledSequenceGroup]
+     # Selected sequences that are going to be swapped in and in a prefill
+     # phase. I.e., it means the prefill has been chunked.
+     prefill_seq_groups: List[ScheduledSequenceGroup]
+     # The blocks to swap in.
+     blocks_to_swap_in: List[Tuple[int, int]]
+     # The blocks to copy.
+     blocks_to_copy: List[Tuple[int, int]]
+     # The number of slots for lookahead decoding.
+     num_lookahead_slots: int
+     # Infeasible sequence groups.
+     infeasible_seq_groups: List[SequenceGroup]
+
+     @classmethod
+     def create_empty(cls) -> "SchedulerSwappedInOutputs":
+         return SchedulerSwappedInOutputs(
+             decode_seq_groups=[],
+             prefill_seq_groups=[],
+             blocks_to_swap_in=[],
+             blocks_to_copy=[],
+             num_lookahead_slots=0,
+             infeasible_seq_groups=[],
+         )
+
+
+ @dataclass
+ class SchedulerPrefillOutputs:
+     """The requests that are scheduled from a waiting queue.
+
+     Could contain a fresh prefill requests or preempted requests that need
+     to be recomputed from scratch.
+     """
+
+     # Selected sequences for prefill.
+     seq_groups: List[ScheduledSequenceGroup]
+     # Ignored sequence groups.
+     ignored_seq_groups: List[SequenceGroup]
+     num_lookahead_slots: int
+
+     @classmethod
+     def create_empty(cls) -> "SchedulerPrefillOutputs":
+         return SchedulerPrefillOutputs(
+             seq_groups=[],
+             ignored_seq_groups=[],
+             num_lookahead_slots=0,
+         )
+
+
+ def seq_group_metadata_builder():
+     return SequenceGroupMetadata(request_id="",
+                                  is_prompt=False,
+                                  seq_data={},
+                                  sampling_params=None,
+                                  block_tables={})
+
+
+ def scheduler_running_outputs_builder():
+     return SchedulerRunningOutputs(decode_seq_groups=[],
+                                    prefill_seq_groups=[],
+                                    preempted=[],
+                                    swapped_out=[],
+                                    blocks_to_swap_out=[],
+                                    blocks_to_copy=[],
+                                    num_lookahead_slots=0,
+                                    prefill_seq_groups_list=[],
+                                    decode_seq_groups_list=[])
+
+
+ def scheduled_seq_group_builder():
+     return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
+                                   token_chunk_size=0)
+     # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
+
+
+ @dataclass
+ class PartialPrefillMetadata:
+     """Holds information about the partial prefills that are currently running
+     during a single iteration of the Scheduler.
+     When chunked prefill is enabled, we allow a certain number of seqs to be
+     partially prefilled during each iteration. Having multiple partial prefills
+     in flight allows us to minimize TTFT and avoid decode starvation in cases
+     where a single sequence group with a very large prompt blocks the queue for
+     too many iterations.
+     The number of long prefill requests is limited so that smaller
+     requests may jump the queue in front of them and get to the decode
+     phase faster.
+     """
+
+     # A minimum bound on the total number of prefills to be scheduled during
+     # this iteration
+     schedulable_prefills: int
+
+     # The number of long prefill requests currently running
+     long_prefills: int
+
+     scheduler_config: SchedulerConfig
+
+     def can_schedule(self, seq_group: SequenceGroup) -> bool:
+         """When concurrent partial prefills are enabled,
+         we limit the number of long requests and only accept
+         shorter requests from the queue while running them
+         concurrently"""
+         return not (seq_group.first_seq.get_num_new_tokens()
+                     > self.scheduler_config.long_prefill_token_threshold
+                     and self.long_prefills
+                     >= self.scheduler_config.max_long_partial_prefills
+                     and self.scheduler_config.max_num_partial_prefills > 1)
+
+     def maybe_increment_partial_prefills(self,
+                                          seq_group: SequenceGroup) -> None:
+         # When a new prefill is scheduled, we need to know if it is a
+         # long request
+         if (seq_group.first_seq.get_num_new_tokens()
+                 > self.scheduler_config.long_prefill_token_threshold):
+             self.long_prefills += 1
+
+     @classmethod
+     def from_queues(
+         cls,
+         running: Deque[SequenceGroup],
+         waiting: Deque[SequenceGroup],
+         scheduler_config: SchedulerConfig,
+     ) -> "PartialPrefillMetadata":
+         """Create a PartialPrefillMetadata object from the current state of
+         the scheduler's queues.
+         This accounts for the currently running prefill requests, and peeks into
+         the waiting queue to see if there are more prefills to potentially be
+         scheduled during this iteration."""
+         prefills = 0
+         long_prefills = 0
+
+         waiting_long_prefills = 0
+
+         for sg in running:
+             if sg.first_seq.data.stage == SequenceStage.PREFILL:
+                 prefills += 1
+                 if (sg.first_seq.get_num_new_tokens()
+                         > scheduler_config.long_prefill_token_threshold):
+                     long_prefills += 1
+
+         for sg in waiting:
+             # Don't bother looping through the rest of the queue if we know
+             # there are already at
+             # least max_partial_prefills requests to fill
+             if prefills >= scheduler_config.max_num_partial_prefills:
+                 break
+
+             # Don't count long requests from the waiting queue if we aren't
+             # going to schedule them anyway
+             if (sg.first_seq.get_num_new_tokens()
+                     > scheduler_config.long_prefill_token_threshold):
+                 if (long_prefills + waiting_long_prefills
+                         >= scheduler_config.max_long_partial_prefills):
+                     continue
+                 waiting_long_prefills += 1
+             prefills += 1
+
+         # NB: long_prefills and waiting_long_prefills are tracked separately.
+         # We don't account for the waiting requests here because we need to use
+         # this metadata to track how many have actually been scheduled.
+         return PartialPrefillMetadata(
+             schedulable_prefills=min(
+                 prefills, scheduler_config.max_num_partial_prefills),
+             long_prefills=long_prefills,
+             scheduler_config=scheduler_config,
+         )
+
+
+ class Scheduler:
+
+     def __init__(
+         self,
+         scheduler_config: SchedulerConfig,
+         cache_config: CacheConfig,
+         lora_config: Optional[LoRAConfig],
+         pipeline_parallel_size: int = 1,
+         output_proc_callback: Optional[Callable] = None,
+     ) -> None:
+         self.scheduler_config = scheduler_config
+         self.cache_config = cache_config
+         # Note for LoRA scheduling: the current policy is extremely
+         # simple and NOT fair. It can lead to starvation of some
+         # LoRAs. This should be improved in the future.
+         self.lora_config = lora_config
+
+         version = "selfattn"
+         if (self.scheduler_config.runner_type == "pooling"
+                 or self.cache_config.is_attention_free):
+             version = "placeholder"
+
+         BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
+             version)
+
+         num_gpu_blocks = cache_config.num_gpu_blocks
+         if num_gpu_blocks:
+             num_gpu_blocks //= pipeline_parallel_size
+
+         num_cpu_blocks = cache_config.num_cpu_blocks
+         if num_cpu_blocks:
+             num_cpu_blocks //= pipeline_parallel_size
+
+         # Create the block space manager.
+         self.block_manager = BlockSpaceManagerImpl(
+             block_size=self.cache_config.block_size,
+             num_gpu_blocks=num_gpu_blocks,
+             num_cpu_blocks=num_cpu_blocks,
+             sliding_window=self.cache_config.sliding_window,
+             enable_caching=self.cache_config.enable_prefix_caching,
+         )
+
+         # Sequence groups in the WAITING state.
+         # Contain new prefill or preempted requests.
+         self.waiting: Deque[SequenceGroup] = deque()
+         # Sequence groups in the RUNNING state.
+         # Contain decode requests.
+         self.running: Deque[SequenceGroup] = deque()
+         # Sequence groups in the SWAPPED state.
+         # Contain decode requests that are swapped out.
+         self.swapped: Deque[SequenceGroup] = deque()
+         # Sequence groups finished requests ids since last step iteration.
+         # It lets the model know that any state associated with these requests
+         # can and must be released after the current step.
+         # This is used to evict the finished requests from the Mamba cache.
+         self._finished_requests_ids: List[str] = list()
+         # Time at previous scheduling step
+         self.prev_time = 0.0
+         # Did we schedule a prompt at previous step?
+         self.prev_prompt = False
+         # Latency of the last prompt step
+         self.last_prompt_latency = 0.0
+         # preemption mode, RECOMPUTE or SWAP
+         self.user_specified_preemption_mode = scheduler_config.preemption_mode
+
+         # The following field is test-only. It is used to inject artificial
+         # preemption.
+         self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
+         self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
+                                        if self.enable_artificial_preemption
+                                        else 0)
+         self.num_cumulative_preemption: int = 0
+
+         # Used to cache python objects
+         self._seq_group_metadata_cache: List[PyObjectCache] = []
+         self._scheduler_running_outputs_cache: List[PyObjectCache] = []
+         self._scheduled_seq_group_cache: List[PyObjectCache] = []
+
+         # For async output processing, we need to swap cache buffers between
+         # iterations. I.e. since the output processing is lagged one step,
+         # we cannot reuse the cached objects immediately when the schedule()
+         # is called again, but only when schedule() is called the second time.
+         self.output_proc_callback = output_proc_callback
+         self.use_async_output_proc = self.output_proc_callback is not None
+         self.num_cache_iters = 2 if self.use_async_output_proc else 1
+
+         self.cache_id = 0
+         for i in range(self.num_cache_iters):
+             self._seq_group_metadata_cache.append(
+                 PyObjectCache(seq_group_metadata_builder))
+             self._scheduler_running_outputs_cache.append(
+                 PyObjectCache(scheduler_running_outputs_builder))
+             self._scheduled_seq_group_cache.append(
+                 PyObjectCache(scheduled_seq_group_builder))
+
+         # For async postprocessor, the extra decode run cannot be done
+         # when the request reaches max_model_len. In this case, the request
+         # will be stopped during schedule() call and added to this stop list
+         # for processing and deallocation by the free_finished_seq_groups()
+         self._async_stopped: List[SequenceGroup] = []
+
+         # List with the chunk sizes to hand out to each sequence depending
+         # on how many partial prefills are running. This is slightly faster than
+         # running an integer division every time a prefill is scheduled.
+         # This splits the budget evenly among all prefills.
+         self.partial_prefill_budget_lookup_list = [0] * (
+             self.scheduler_config.max_num_partial_prefills + 1)
+         self.partial_prefill_budget_lookup_list[0] = (
+             scheduler_config.max_num_batched_tokens)
+         for i in range(1, self.scheduler_config.max_num_partial_prefills + 1):
+             self.partial_prefill_budget_lookup_list[i] = (
+                 scheduler_config.max_num_batched_tokens // i)
+
538
+ @property
539
+ def next_cache_id(self):
540
+ return (self.cache_id + 1) % self.num_cache_iters
541
+
542
+ @property
543
+ def lora_enabled(self) -> bool:
544
+ return bool(self.lora_config)
545
+
546
+ @property
547
+ def num_decoding_tokens_per_seq(self) -> int:
548
+ """The number of new tokens."""
549
+ return 1
550
+
551
+ def add_seq_group(self, seq_group: SequenceGroup) -> None:
552
+ # Add sequence groups to the waiting queue.
553
+ self.waiting.append(seq_group)
554
+
555
+ def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None:
556
+ # Add sequence groups to the running queue.
557
+ # Only for testing purposes.
558
+ self.running.append(seq_group)
559
+
560
+ def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None:
561
+ # Add sequence groups to the swapped queue.
562
+ # Only for testing purposes.
563
+ self.swapped.append(seq_group)
564
+
565
+ def abort_seq_group(
566
+ self,
567
+ request_id: Union[str, Iterable[str]],
568
+ seq_id_to_seq_group: Optional[Dict[str, SequenceGroupBase]] = None,
569
+ ) -> None:
570
+ """Aborts a sequence group with the given ID.
571
+
572
+ Check if the sequence group with the given ID
573
+ is present in any of the state queue.
574
+ If present, remove the sequence group from the state queue.
575
+ Also, if any of the sequences in the sequence group is not finished,
576
+ free the sequence with status `FINISHED_ABORTED`.
577
+ Otherwise, do nothing.
578
+
579
+ Args:
580
+ request_id: The ID(s) of the sequence group to abort.
581
+ seq_id_to_seq_group: helper for groups with n>1
582
+ """
583
+ if isinstance(request_id, str):
584
+ request_id = (request_id, )
585
+ request_ids = set(request_id)
586
+ seq_id_to_seq_group = seq_id_to_seq_group or {}
587
+ for state_queue in [self.waiting, self.running, self.swapped]:
588
+ aborted_groups: List[SequenceGroup] = []
589
+ for seq_group in state_queue:
590
+ # When n>1, seq_group.request_id looks like
591
+ # foo_parallel_sample_0, while request_ids is just foo, and we
592
+ # should resolve it as real_request_id to match.
593
+ if seq_group.request_id in seq_id_to_seq_group:
594
+ real_request_id = seq_id_to_seq_group[
595
+ seq_group.request_id].group_id
596
+ else:
597
+ real_request_id = seq_group.request_id
598
+ if real_request_id in request_ids:
599
+ # Appending aborted group into pending list.
600
+ aborted_groups.append(seq_group)
601
+ # We can't remove real_request_id in request_ids here,
602
+ # because there may be other seq groups sharing the same
603
+ # real_request_id
604
+ for aborted_group in aborted_groups:
605
+ # Remove the sequence group from the state queue.
606
+ state_queue.remove(aborted_group)
607
+ # Remove the aborted request from the Mamba cache.
608
+ self._finished_requests_ids.append(aborted_group.request_id)
609
+ for seq in aborted_group.get_seqs():
610
+ if seq.is_finished():
611
+ continue
612
+ seq.status = SequenceStatus.FINISHED_ABORTED
613
+ self.free_seq(seq)
614
+ if aborted_group.request_id in seq_id_to_seq_group:
615
+ del seq_id_to_seq_group[aborted_group.request_id]
616
+
617
+ self._free_seq_group_cross_attn_blocks(aborted_group)
618
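# [Editor's note] A minimal sketch of the request-id resolution performed
# above for n>1 groups; the class and ids below are made-up stand-ins, not
# vLLM objects:
#
#     >>> class FakeGroup:            # stands in for SequenceGroupBase
#     ...     group_id = "foo"
#     >>> mapping = {"foo_parallel_sample_0": FakeGroup()}
#     >>> child = "foo_parallel_sample_0"
#     >>> mapping[child].group_id if child in mapping else child
#     'foo'
#
# The resolved id ("foo") is then compared against the ids passed to
# abort_seq_group().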
+
619
+ def _free_seq_group_cross_attn_blocks(
620
+ self,
621
+ seq_group: SequenceGroup,
622
+ ) -> None:
623
+ """
624
+ Free a sequence group from a cross-attention block table.
625
+ Has no effect on decoder-only models.
626
+ """
627
+ if seq_group.is_encoder_decoder():
628
+ self.block_manager.free_cross(seq_group)
629
+
630
+ def has_unfinished_seqs(self) -> bool:
631
+ return (len(self.waiting) != 0 or len(self.running) != 0
632
+ or len(self.swapped) != 0)
633
+
634
+ def get_prefix_cache_hit_rate(self, device: Device) -> float:
635
+ return self.block_manager.get_prefix_cache_hit_rate(device)
636
+
637
+ def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
638
+ return self.block_manager.reset_prefix_cache(device)
639
+
640
+ def get_num_unfinished_seq_groups(self) -> int:
641
+ return len(self.waiting) + len(self.running) + len(self.swapped)
642
+
643
+ def get_and_reset_finished_requests_ids(self) -> List[str]:
644
+ """Flushes the list of request ids of previously finished seq_groups."""
645
+ finished_requests_ids = self._finished_requests_ids
646
+ self._finished_requests_ids = list()
647
+ return finished_requests_ids
648
+
649
+ def _schedule_running(
650
+ self,
651
+ budget: SchedulingBudget,
652
+ curr_loras: Optional[Set[int]],
653
+ enable_chunking: bool = False,
654
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
655
+ ) -> SchedulerRunningOutputs:
656
+ """Schedule sequence groups that are running.
657
+
658
+ Running queue should include decode and chunked prefill requests.
659
+
660
+ Args:
661
+ budget: The scheduling budget. The argument is in-place updated
662
+ when any decodes are preempted.
663
+ curr_loras: Currently batched lora request ids. The argument is
664
+ in-place updated when any decodes are preempted.
665
+ enable_chunking: If True, seq group can be chunked and only a
666
+ chunked number of tokens are scheduled if
667
+ `budget.num_batched_tokens` does not have enough capacity to schedule
668
+ all tokens.
669
+ partial_prefill_metadata: information about the partial prefills
670
+ that are currently running
671
+
672
+ Returns:
673
+ SchedulerRunningOutputs.
674
+ """
675
+ ret: SchedulerRunningOutputs = self._scheduler_running_outputs_cache[
676
+ self.cache_id].get_object()
677
+ ret.blocks_to_swap_out.clear()
678
+ ret.blocks_to_copy.clear()
679
+ ret.decode_seq_groups.clear()
680
+ ret.prefill_seq_groups.clear()
681
+ ret.preempted.clear()
682
+ ret.swapped_out.clear()
683
+
684
+ ret.num_lookahead_slots = self._get_num_lookahead_slots(
685
+ is_prefill=False, enable_chunking=enable_chunking)
686
+
687
+ ret.decode_seq_groups_list.clear()
688
+ ret.prefill_seq_groups_list.clear()
689
+
690
+ # Blocks that need to be swapped or copied before model execution.
691
+ blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
692
+ blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy
693
+
694
+ decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
695
+ prefill_seq_groups: List[
696
+ ScheduledSequenceGroup] = ret.prefill_seq_groups
697
+ preempted: List[SequenceGroup] = ret.preempted
698
+ swapped_out: List[SequenceGroup] = ret.swapped_out
699
+
700
+ running_queue = self.running
701
+ assert len(self._async_stopped) == 0
702
+ while running_queue:
703
+ seq_group = running_queue[0]
704
+ # We discard the cached tokens info here because we don't need it
705
+ # for running sequence:
706
+ # 1. If a sequence is running with chunked prefill, the cached
707
+ # tokens info was already used for the first prefill.
708
+ # 2. If a sequence is running with non-chunked prefill, then
709
+ # it is a decoding sequence, and the cached tokens info is
710
+ # irrelevant.
711
+ num_uncached_new_tokens, _ = \
712
+ self._get_num_new_uncached_and_cached_tokens(
713
+ seq_group,
714
+ SequenceStatus.RUNNING,
715
+ enable_chunking,
716
+ budget,
717
+ partial_prefill_metadata,
718
+ )
719
+
720
+ num_running_tokens = num_uncached_new_tokens
721
+ if num_running_tokens == 0:
722
+ # No budget => Stop
723
+ break
724
+
725
+ running_queue.popleft()
726
+
727
+ # With async postprocessor, an extra decode run is done
728
+ # to process the final tokens. The check below avoids this extra
729
+ # decode run when the model max len is reached, in order to avoid
730
+ # a memory overflow.
731
+ if (self.use_async_output_proc and seq_group.seqs[0].get_len()
732
+ > self.scheduler_config.max_model_len):
733
+ self._async_stopped.append(seq_group)
734
+ continue
735
+
736
+ # NOTE(woosuk): Preemption happens only when there is no available
737
+ # slot to keep all the sequence groups in the RUNNING state.
738
+ while not self._can_append_slots(seq_group, enable_chunking):
739
+ budget.subtract_num_batched_tokens(seq_group.request_id,
740
+ num_running_tokens)
741
+ num_running_seqs = seq_group.get_max_num_running_seqs()
742
+ budget.subtract_num_seqs(seq_group.request_id,
743
+ num_running_seqs)
744
+
745
+ if (curr_loras is not None and seq_group.lora_int_id > 0
746
+ and seq_group.lora_int_id in curr_loras):
747
+ curr_loras.remove(seq_group.lora_int_id)
748
+
749
+ # Determine victim sequence
750
+ cont_loop = True
751
+ if running_queue:
752
+ # Preempt the lowest-priority sequence group.
753
+ victim_seq_group = running_queue.pop()
754
+ else:
755
+ # No other sequence group can be preempted.
756
+ # Preempt the current sequence group.
757
+ # Note: This is also where we stop this loop
758
+ # (since there is nothing else to preempt)
759
+ victim_seq_group = seq_group
760
+ cont_loop = False
761
+
762
+ # With async postprocessor, before preempting a sequence
763
+ # we need to ensure it has no pending async postprocessor
764
+ do_preempt = True
765
+ if self.use_async_output_proc:
766
+ assert self.output_proc_callback is not None
767
+ self.output_proc_callback(
768
+ request_id=victim_seq_group.request_id)
769
+
770
+ # It may be that the async pending "victim_seq_group"
771
+ # becomes finished, in which case we simply free it.
772
+ if victim_seq_group.is_finished():
773
+ self._free_finished_seq_group(victim_seq_group)
774
+ do_preempt = False
775
+
776
+ # Do preemption
777
+ if do_preempt:
778
+ preempted_mode = self._preempt(victim_seq_group,
779
+ blocks_to_swap_out)
780
+ if preempted_mode == PreemptionMode.RECOMPUTE:
781
+ preempted.append(victim_seq_group)
782
+ else:
783
+ swapped_out.append(victim_seq_group)
784
+
785
+ if not cont_loop:
786
+ break
787
+ else:
788
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
789
+ is_prefill = seq_group.is_prefill()
790
+
791
+ scheduled_seq_group: ScheduledSequenceGroup = (
792
+ self._scheduled_seq_group_cache[
793
+ self.cache_id].get_object())
794
+ scheduled_seq_group.seq_group = seq_group
795
+ if is_prefill:
796
+ scheduled_seq_group.token_chunk_size = num_running_tokens
797
+ prefill_seq_groups.append(scheduled_seq_group)
798
+ ret.prefill_seq_groups_list.append(seq_group)
799
+ else:
800
+ scheduled_seq_group.token_chunk_size = 1
801
+ decode_seq_groups.append(scheduled_seq_group)
802
+ ret.decode_seq_groups_list.append(seq_group)
803
+
804
+ budget.add_num_batched_tokens(seq_group.request_id,
805
+ num_running_tokens)
806
+ # OPTIMIZATION: Note that get_max_num_running_seqs is
807
+ # expensive. For the default scheduling case where
808
+ # enable_chunking is False, num_seqs are updated before running
809
+ # this method, so we don't have to update it again here.
810
+ if enable_chunking:
811
+ num_running_seqs = seq_group.get_max_num_running_seqs()
812
+ budget.add_num_seqs(seq_group.request_id, num_running_seqs)
813
+ if curr_loras is not None and seq_group.lora_int_id > 0:
814
+ curr_loras.add(seq_group.lora_int_id)
815
+
816
+ self._scheduler_running_outputs_cache[self.next_cache_id].reset()
817
+ self._scheduled_seq_group_cache[self.next_cache_id].reset()
818
+
819
+ return ret
820
+
821
+ def _schedule_swapped(
822
+ self,
823
+ budget: SchedulingBudget,
824
+ curr_loras: Optional[Set[int]],
825
+ enable_chunking: bool = False,
826
+ ) -> SchedulerSwappedInOutputs:
827
+ """Schedule sequence groups that are swapped out.
828
+
829
+ It schedules swapped requests as long as it fits `budget` and
830
+ curr_loras <= max_lora from the scheduling config. The input arguments
831
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
832
+
833
+ Args:
834
+ budget: The scheduling budget. The argument is in-place updated
835
+ when any requests are swapped in.
836
+ curr_loras: Currently batched lora request ids. The argument is
837
+ in-place updated when any requests are swapped in.
838
+ enable_chunking: If True, seq group can be chunked and only a
839
+ chunked number of tokens are scheduled if
840
+ `budget.num_batched_tokens` does not have enough capacity to schedule
841
+ all tokens.
842
+
843
+ Returns:
844
+ SchedulerSwappedInOutputs.
845
+ """
846
+ # Blocks that need to be swapped or copied before model execution.
847
+ blocks_to_swap_in: List[Tuple[int, int]] = []
848
+ blocks_to_copy: List[Tuple[int, int]] = []
849
+ decode_seq_groups: List[ScheduledSequenceGroup] = []
850
+ prefill_seq_groups: List[ScheduledSequenceGroup] = []
851
+ infeasible_seq_groups: List[SequenceGroup] = []
852
+
853
+ swapped_queue = self.swapped
854
+
855
+ leftover_swapped: Deque[SequenceGroup] = deque()
856
+ while swapped_queue:
857
+ seq_group = swapped_queue[0]
858
+
859
+ # If the sequence group cannot be swapped in, stop.
860
+ is_prefill = seq_group.is_prefill()
861
+ alloc_status = self.block_manager.can_swap_in(
862
+ seq_group,
863
+ self._get_num_lookahead_slots(is_prefill, enable_chunking))
864
+ if alloc_status == AllocStatus.LATER:
865
+ break
866
+ elif alloc_status == AllocStatus.NEVER:
867
+ logger.warning(
868
+ "Failing the request %s because there's not enough kv "
869
+ "cache blocks to run the entire sequence.",
870
+ seq_group.request_id,
871
+ )
872
+ for seq in seq_group.get_seqs():
873
+ seq.status = SequenceStatus.FINISHED_IGNORED
874
+ infeasible_seq_groups.append(seq_group)
875
+ swapped_queue.popleft()
876
+ continue
877
+
878
+ lora_int_id = 0
879
+ if self.lora_enabled:
880
+ lora_int_id = seq_group.lora_int_id
881
+ assert curr_loras is not None
882
+ assert self.lora_config is not None
883
+ if (lora_int_id > 0 and (lora_int_id not in curr_loras)
884
+ and len(curr_loras) >= self.lora_config.max_loras):
885
+ # We don't have a space for another LoRA, so
886
+ # we ignore this request for now.
887
+ leftover_swapped.appendleft(seq_group)
888
+ swapped_queue.popleft()
889
+ continue
890
+
891
+ # The total number of sequences in the RUNNING state should not
892
+ # exceed the maximum number of sequences.
893
+ num_new_seqs = seq_group.get_max_num_running_seqs()
894
+ num_new_tokens_uncached, num_new_tokens_cached = (
895
+ self._get_num_new_uncached_and_cached_tokens(
896
+ seq_group, SequenceStatus.SWAPPED, enable_chunking,
897
+ budget))
898
+
899
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
900
+ num_new_tokens=num_new_tokens_uncached,
901
+ num_new_seqs=num_new_seqs,
902
+ ):
903
+ break
904
+
905
+ if lora_int_id > 0 and curr_loras is not None:
906
+ curr_loras.add(lora_int_id)
907
+ swapped_queue.popleft()
908
+ self._swap_in(seq_group, blocks_to_swap_in)
909
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
910
+ if is_prefill:
911
+ prefill_seq_groups.append(
912
+ ScheduledSequenceGroup(
913
+ seq_group,
914
+ token_chunk_size=num_new_tokens_uncached +
915
+ num_new_tokens_cached,
916
+ ))
917
+ else:
918
+ decode_seq_groups.append(
919
+ ScheduledSequenceGroup(seq_group, token_chunk_size=1))
920
+ budget.add_num_batched_tokens(
921
+ seq_group.request_id,
922
+ num_batched_tokens=num_new_tokens_uncached,
923
+ num_cached_tokens=num_new_tokens_cached,
924
+ )
925
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
926
+
927
+ swapped_queue.extendleft(leftover_swapped)
928
+
929
+ return SchedulerSwappedInOutputs(
930
+ decode_seq_groups=decode_seq_groups,
931
+ prefill_seq_groups=prefill_seq_groups,
932
+ blocks_to_swap_in=blocks_to_swap_in,
933
+ blocks_to_copy=blocks_to_copy,
934
+ num_lookahead_slots=self._get_num_lookahead_slots(
935
+ is_prefill=False, enable_chunking=enable_chunking),
936
+ infeasible_seq_groups=infeasible_seq_groups,
937
+ )
938
+
939
+ def _get_prompt_limit(self, seq_group: SequenceGroup) -> int:
940
+ if (self.scheduler_config.chunked_prefill_enabled
941
+ and not self.scheduler_config.is_multi_step):
942
+ prompt_limit = self.scheduler_config.max_model_len
943
+ else:
944
+ prompt_limit = min(
945
+ self.scheduler_config.max_model_len,
946
+ self.scheduler_config.max_num_batched_tokens,
947
+ )
948
+
949
+ # Model is fine-tuned with long context. Return the fine-tuned max_len.
950
+ if seq_group.lora_request and seq_group.lora_request.long_lora_max_len:
951
+ assert prompt_limit <= seq_group.lora_request.long_lora_max_len
952
+ return seq_group.lora_request.long_lora_max_len
953
+ else:
954
+ return prompt_limit
955
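# [Editor's note] Illustrative behaviour of the limit above, using assumed
# values max_model_len=4096 and max_num_batched_tokens=2048:
#
#     >>> max_model_len, max_num_batched_tokens = 4096, 2048
#     >>> chunked_prefill = False
#     >>> max_model_len if chunked_prefill else min(max_model_len,
#     ...                                           max_num_batched_tokens)
#     2048
#
# With chunked prefill enabled the limit is the full 4096 tokens, since the
# prompt no longer has to fit into a single batch.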
+
956
+ def _get_priority(self,
957
+ seq_group: SequenceGroup) -> Tuple[Optional[int], float]:
958
+ """Get the priority of the sequence group.
959
+ Highest preference to user-defined priority, followed by arrival time.
960
+ Args:
961
+ seq_group: The sequence group input.
962
+ Returns:
963
+ The priority of the sequence group.
964
+ """
965
+ return seq_group.priority, seq_group.arrival_time
966
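# [Editor's note] Sketch of how the (priority, arrival_time) key above sorts
# sequence groups; the tuples are made up for illustration:
#
#     >>> keys = [(1, 10.0), (0, 12.0), (0, 11.0)]
#     >>> sorted(keys)
#     [(0, 11.0), (0, 12.0), (1, 10.0)]
#
# Lower priority values win, and arrival time breaks ties, which is the
# ordering _schedule_priority_preemption relies on below.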
+
967
+ def _schedule_priority_preemption(
968
+ self,
969
+ budget: SchedulingBudget,
970
+ ) -> int:
971
+ """Sorts waiting and running queue. Also, force preempt requests
972
+ from the running queue if their priority is lower.
973
+ Priority-based preemption is used with the priority policy.
974
+ Args:
975
+ budget: The scheduling budget. The argument is in-place updated
976
+ when any requests are scheduled.
977
+ Returns:
978
+ A count of priority-based preemptions.
979
+ """
980
+
981
+ waiting_queue = self.waiting
982
+
983
+ running_queue = deque(sorted(self.running, key=self._get_priority))
984
+
985
+ blocks_to_swap_out: List[Tuple[int, int]] = []
986
+ force_preemption_count = 0
987
+
988
+ if waiting_queue:
989
+ seq_group = waiting_queue.popleft()
990
+ num_new_seqs = seq_group.get_max_num_running_seqs()
991
+ num_new_tokens_uncached, _ = \
992
+ self._get_num_new_uncached_and_cached_tokens(
993
+ seq_group, SequenceStatus.WAITING, False, budget)
994
+
995
+ # Only preempt if priority inversion exists
996
+ while running_queue and self._get_priority(
997
+ running_queue[-1]) > self._get_priority(seq_group):
998
+ # Only preempt if waiting sequence cannot be allocated
999
+ can_allocate = self.block_manager.can_allocate(seq_group)
1000
+ if (num_new_tokens_uncached > 0
1001
+ and can_allocate == AllocStatus.OK
1002
+ and budget.can_schedule(
1003
+ num_new_tokens=num_new_tokens_uncached,
1004
+ num_new_seqs=num_new_seqs,
1005
+ )):
1006
+ break
1007
+
1008
+ # Adjust budget to remove the victim sequence group
1009
+ vseq_group = running_queue.pop()
1010
+ num_running_tokens_uncached, _ = (
1011
+ self._get_num_new_uncached_and_cached_tokens(
1012
+ vseq_group, SequenceStatus.RUNNING, False, budget))
1013
+ budget.subtract_num_batched_tokens(
1014
+ vseq_group.request_id, num_running_tokens_uncached)
1015
+ num_running_seqs = vseq_group.get_max_num_running_seqs()
1016
+ budget.subtract_num_seqs(vseq_group.request_id,
1017
+ num_running_seqs)
1018
+
1019
+ # Preempt out the victim sequence group
1020
+ self._preempt(vseq_group, blocks_to_swap_out)
1021
+ waiting_queue.appendleft(vseq_group)
1022
+ force_preemption_count += 1
1023
+ # Put the sequence back into the waiting queue
1024
+ waiting_queue.appendleft(seq_group)
1025
+
1026
+ waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))
1027
+
1028
+ self.waiting = waiting_queue
1029
+ self.running = running_queue
1030
+ return force_preemption_count
1031
+
1032
+ def _schedule_prefills(
1033
+ self,
1034
+ budget: SchedulingBudget,
1035
+ curr_loras: Optional[Set[int]],
1036
+ enable_chunking: bool = False,
1037
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
1038
+ ) -> SchedulerPrefillOutputs:
1039
+ """Schedule sequence groups that are in prefill stage.
1040
+
1041
+ Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
1042
+ as a new prefill (that starts from the beginning -> most recently generated
1043
+ tokens).
1044
+
1045
+ It schedules waiting requests as long as it fits `budget` and
1046
+ curr_loras <= max_lora from the scheduling config. The input arguments
1047
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
1048
+
1049
+ Args:
1050
+ budget: The scheduling budget. The argument is in-place updated
1051
+ when any requests are scheduled.
1052
+ curr_loras: Currently batched lora request ids. The argument is
1053
+ in-place updated when any requests are scheduled.
1054
+ enable_chunking: If True, seq group can be chunked and only a
1055
+ chunked number of tokens are scheduled if
1056
+ `budget.num_batched_tokens` does not have enough capacity to schedule
1057
+ all tokens.
1058
+ partial_prefill_metadata: information about the partial prefills
1059
+ that are currently running
1060
+
1061
+ Returns:
1062
+ SchedulerPrefillOutputs.
1063
+ """
1064
+ if budget.remaining_token_budget() == 0:
1065
+ # Do nothing: Can't add any more prefill anyway
1066
+ return SchedulerPrefillOutputs(
1067
+ seq_groups=[],
1068
+ ignored_seq_groups=[],
1069
+ num_lookahead_slots=self._get_num_lookahead_slots(
1070
+ is_prefill=True, enable_chunking=enable_chunking),
1071
+ )
1072
+ ignored_seq_groups: List[SequenceGroup] = []
1073
+ seq_groups: List[ScheduledSequenceGroup] = []
1074
+
1075
+ waiting_queue = self.waiting
1076
+
1077
+ leftover_waiting_sequences: Deque[SequenceGroup] = deque()
1078
+ while self._passed_delay(time.time()) and waiting_queue:
1079
+ seq_group = waiting_queue[0]
1080
+
1081
+ waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
1082
+ assert len(waiting_seqs) == 1, (
1083
+ "Waiting sequence group should have only one prompt "
1084
+ "sequence.")
1085
+ if (partial_prefill_metadata is not None
1086
+ and not partial_prefill_metadata.can_schedule(seq_group)):
1087
+ leftover_waiting_sequences.appendleft(seq_group)
1088
+ waiting_queue.popleft()
1089
+ continue
1090
+ num_new_tokens_uncached, num_new_tokens_cached = (
1091
+ self._get_num_new_uncached_and_cached_tokens(
1092
+ seq_group,
1093
+ SequenceStatus.WAITING,
1094
+ enable_chunking,
1095
+ budget,
1096
+ partial_prefill_metadata=partial_prefill_metadata,
1097
+ ))
1098
+ num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached
1099
+
1100
+ if not enable_chunking:
1101
+ num_prompt_tokens = waiting_seqs[0].get_len()
1102
+ assert num_new_tokens == num_prompt_tokens
1103
+
1104
+ prompt_limit = self._get_prompt_limit(seq_group)
1105
+ if num_new_tokens > prompt_limit:
1106
+ logger.warning(
1107
+ "Input prompt (%d tokens) is too long"
1108
+ " and exceeds limit of %d",
1109
+ num_new_tokens,
1110
+ prompt_limit,
1111
+ )
1112
+ for seq in waiting_seqs:
1113
+ seq.status = SequenceStatus.FINISHED_IGNORED
1114
+ ignored_seq_groups.append(seq_group)
1115
+ waiting_queue.popleft()
1116
+ continue
1117
+
1118
+ num_lookahead_slots: int = 0
1119
+ if self.scheduler_config.is_multi_step and enable_chunking:
1120
+ num_lookahead_slots = self._get_num_lookahead_slots(
1121
+ True, enable_chunking)
1122
+
1123
+ # If the sequence group cannot be allocated, stop.
1124
+ can_allocate = self.block_manager.can_allocate(
1125
+ seq_group, num_lookahead_slots=num_lookahead_slots)
1126
+ if can_allocate == AllocStatus.LATER:
1127
+ break
1128
+ elif can_allocate == AllocStatus.NEVER:
1129
+ logger.warning(
1130
+ "Input prompt (%d tokens) + lookahead slots (%d) is "
1131
+ "too long and exceeds the capacity of block_manager",
1132
+ num_new_tokens,
1133
+ num_lookahead_slots,
1134
+ )
1135
+ for seq in waiting_seqs:
1136
+ seq.status = SequenceStatus.FINISHED_IGNORED
1137
+ ignored_seq_groups.append(seq_group)
1138
+ waiting_queue.popleft()
1139
+ continue
1140
+
1141
+ lora_int_id = 0
1142
+ if self.lora_enabled:
1143
+ lora_int_id = seq_group.lora_int_id
1144
+ assert curr_loras is not None
1145
+ assert self.lora_config is not None
1146
+ if (self.lora_enabled and lora_int_id > 0
1147
+ and lora_int_id not in curr_loras
1148
+ and len(curr_loras) >= self.lora_config.max_loras):
1149
+ # We don't have a space for another LoRA, so
1150
+ # we ignore this request for now.
1151
+ leftover_waiting_sequences.appendleft(seq_group)
1152
+ waiting_queue.popleft()
1153
+ continue
1154
+
1155
+ if (budget.num_batched_tokens
1156
+ >= self.scheduler_config.max_num_batched_tokens):
1157
+ # We've reached the budget limit - since there might be
1158
+ # continuous prefills in the running queue, we should break
1159
+ # to avoid scheduling any new prefills.
1160
+ break
1161
+
1162
+ num_new_seqs = seq_group.get_max_num_running_seqs()
1163
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
1164
+ num_new_tokens=num_new_tokens_uncached,
1165
+ num_new_seqs=num_new_seqs,
1166
+ ):
1167
+ break
1168
+
1169
+ # Can schedule this request.
1170
+ if curr_loras is not None and lora_int_id > 0:
1171
+ curr_loras.add(lora_int_id)
1172
+ waiting_queue.popleft()
1173
+ self._allocate_and_set_running(seq_group)
1174
+
1175
+ if partial_prefill_metadata is not None:
1176
+ partial_prefill_metadata.maybe_increment_partial_prefills(
1177
+ seq_group)
1178
+
1179
+ if enable_chunking and self.scheduler_config.is_multi_step:
1180
+ blocks_to_copy: List[Tuple[int, int]] = []
1181
+ # init_multi_step_from_lookahead_slots happens in append_slots
1182
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
1183
+ # This assert will trip when a copy-on-write happens. This is
1184
+ # not a concern as the very first sequence-group block
1185
+ # allocation happens above. Still, we have the assert to
1186
+ # catch any edge-cases.
1187
+ assert not blocks_to_copy
1188
+ else:
1189
+ seq_group.init_multi_step_from_lookahead_slots(
1190
+ num_lookahead_slots,
1191
+ num_scheduler_steps=self.scheduler_config.
1192
+ num_scheduler_steps,
1193
+ is_multi_step=self.scheduler_config.is_multi_step,
1194
+ enable_chunking=enable_chunking,
1195
+ )
1196
+
1197
+ seq_groups.append(
1198
+ ScheduledSequenceGroup(seq_group=seq_group,
1199
+ token_chunk_size=num_new_tokens))
1200
+ budget.add_num_batched_tokens(
1201
+ seq_group.request_id,
1202
+ num_batched_tokens=num_new_tokens_uncached,
1203
+ num_cached_tokens=num_new_tokens_cached,
1204
+ )
1205
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
1206
+
1207
+ # Queue requests that couldn't be scheduled.
1208
+ waiting_queue.extendleft(leftover_waiting_sequences)
1209
+ if len(seq_groups) > 0:
1210
+ self.prev_prompt = True
1211
+
1212
+ return SchedulerPrefillOutputs(
1213
+ seq_groups=seq_groups,
1214
+ ignored_seq_groups=ignored_seq_groups,
1215
+ num_lookahead_slots=self._get_num_lookahead_slots(
1216
+ is_prefill=True, enable_chunking=enable_chunking),
1217
+ )
1218
+
1219
+ def _schedule_default(self) -> SchedulerOutputs:
1220
+ """Schedule queued requests.
1221
+
1222
+ The current policy is designed to optimize throughput. First,
1223
+ it batches as many prefill requests as possible. Then it schedules
1224
+ decodes. If there is pressure on GPU memory, decode requests can
1225
+ be swapped or preempted.
1226
+ """
1227
+ # Include running requests to the budget.
1228
+ budget = SchedulingBudget(
1229
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1230
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1231
+ )
1232
+ # Make sure we include num running seqs before scheduling prefill,
1233
+ # so that we don't schedule beyond max_num_seqs for prefill.
1234
+ for seq_group in self.running:
1235
+ budget.add_num_seqs(seq_group.request_id,
1236
+ seq_group.get_max_num_running_seqs())
1237
+ curr_loras = (set(
1238
+ seq_group.lora_int_id for seq_group in self.running
1239
+ if seq_group.lora_int_id > 0) if self.lora_enabled else None)
1240
+
1241
+ prefills = SchedulerPrefillOutputs.create_empty()
1242
+ running_scheduled = SchedulerRunningOutputs.create_empty()
1243
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1244
+
1245
+ # If any requests are swapped, prioritize swapped requests.
1246
+ if not self.swapped:
1247
+ prefills = self._schedule_prefills(budget,
1248
+ curr_loras,
1249
+ enable_chunking=False)
1250
+
1251
+ if len(prefills.seq_groups
1252
+ ) == 0 and self.scheduler_config.policy == "priority":
1253
+ self._schedule_priority_preemption(budget)
1254
+
1255
+ # Don't schedule decodes if prefills are scheduled.
1256
+ # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
1257
+ # only contains decode requests, not chunked prefills.
1258
+ if len(prefills.seq_groups) == 0:
1259
+ running_scheduled = self._schedule_running(budget,
1260
+ curr_loras,
1261
+ enable_chunking=False)
1262
+
1263
+ # If any sequence group is preempted, do not swap in any sequence
1264
+ # group, because it means there's no slot for new running requests.
1265
+ if (len(running_scheduled.preempted) +
1266
+ len(running_scheduled.swapped_out) == 0):
1267
+ swapped_in = \
1268
+ self._schedule_swapped(budget, curr_loras)
1269
+
1270
+ assert (budget.num_batched_tokens
1271
+ <= self.scheduler_config.max_num_batched_tokens)
1272
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1273
+
1274
+ # Update waiting requests.
1275
+ self.waiting.extendleft(running_scheduled.preempted)
1276
+ # Update new running requests.
1277
+ if len(prefills.seq_groups) > 0:
1278
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1279
+
1280
+ self.running.extend(running_scheduled.decode_seq_groups_list)
1281
+
1282
+ if len(swapped_in.decode_seq_groups) > 0:
1283
+ self.running.extend(
1284
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1285
+
1286
+ # Update swapped requests.
1287
+ self.swapped.extend(running_scheduled.swapped_out)
1288
+ preempted = len(running_scheduled.preempted) + len(
1289
+ running_scheduled.swapped_out)
1290
+
1291
+ # There should be no prefill from running queue because this policy
1292
+ # doesn't allow chunked prefills.
1293
+ assert len(running_scheduled.prefill_seq_groups) == 0
1294
+ assert len(swapped_in.prefill_seq_groups) == 0
1295
+
1296
+ # Merge lists
1297
+ num_prefill_groups = len(prefills.seq_groups)
1298
+ if num_prefill_groups > 0:
1299
+ scheduled_seq_groups = prefills.seq_groups
1300
+ scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
1301
+ else:
1302
+ scheduled_seq_groups = running_scheduled.decode_seq_groups
1303
+ scheduled_seq_groups.extend(swapped_in.decode_seq_groups)
1304
+
1305
+ blocks_to_copy = running_scheduled.blocks_to_copy
1306
+ blocks_to_copy.extend(swapped_in.blocks_to_copy)
1307
+
1308
+ ignored_seq_groups = prefills.ignored_seq_groups
1309
+ ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)
1310
+
1311
+ return SchedulerOutputs(
1312
+ scheduled_seq_groups=scheduled_seq_groups,
1313
+ num_prefill_groups=num_prefill_groups,
1314
+ num_batched_tokens=budget.num_batched_tokens +
1315
+ budget.num_cached_tokens,
1316
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1317
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1318
+ blocks_to_copy=blocks_to_copy,
1319
+ ignored_seq_groups=ignored_seq_groups,
1320
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
1321
+ running_queue_size=len(self.running),
1322
+ preempted=preempted,
1323
+ )
1324
+
1325
+ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
1326
+ """Schedule queued requests.
1327
+
1328
+ Chunked prefill allows prefill requests to be chunked and batched
1329
+ together with decode requests. This policy 1. schedules as many
1330
+ decoding requests as possible, 2. schedules chunked prefill requests
1331
+ that are not finished, 3. schedules swapped requests, and 4. schedules
1332
+ new prefill requests.
1333
+
1334
+ The policy sustains high GPU utilization because it can put prefill
1335
+ and decode requests in the same batch, while it improves inter-token
1336
+ latency because decode requests don't need to be blocked by prefill
1337
+ requests.
1338
+ """
1339
+ budget = SchedulingBudget(
1340
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1341
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1342
+ )
1343
+ curr_loras: Set[int] = set()
1344
+
1345
+ prefills = SchedulerPrefillOutputs.create_empty()
1346
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1347
+
1348
+ # Create partial prefill metadata
1349
+ partial_prefill_metadata = PartialPrefillMetadata.from_queues(
1350
+ running=self.running,
1351
+ waiting=self.waiting,
1352
+ scheduler_config=self.scheduler_config,
1353
+ )
1354
+
1355
+ # Decoding should be always scheduled first by fcfs.
1356
+ running_scheduled = self._schedule_running(
1357
+ budget,
1358
+ curr_loras,
1359
+ enable_chunking=True,
1360
+ partial_prefill_metadata=partial_prefill_metadata,
1361
+ )
1362
+
1363
+ # Schedule swapped out requests.
1364
+ # If preemption happens, it means we don't have space for swap-in.
1365
+ if len(running_scheduled.preempted) + len(
1366
+ running_scheduled.swapped_out) == 0:
1367
+ swapped_in = self._schedule_swapped(budget, curr_loras)
1368
+
1369
+ prefills = self._schedule_prefills(
1370
+ budget,
1371
+ curr_loras,
1372
+ enable_chunking=True,
1373
+ partial_prefill_metadata=partial_prefill_metadata,
1374
+ )
1375
+
1376
+ assert (budget.num_batched_tokens
1377
+ <= self.scheduler_config.max_num_batched_tokens)
1378
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1379
+
1380
+ # Update waiting requests.
1381
+ self.waiting.extendleft(running_scheduled.preempted)
1382
+
1383
+ # Update new running requests.
1384
+ # By default, vLLM scheduler prioritizes prefills.
1385
+ # Once chunked prefill is enabled,
1386
+ # the policy is changed to prioritize decode requests.
1387
+ self.running.extend(
1388
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1389
+ self.running.extend(
1390
+ [s.seq_group for s in swapped_in.prefill_seq_groups])
1391
+ self.running.extend(
1392
+ [s.seq_group for s in running_scheduled.decode_seq_groups])
1393
+ # Because multiple prefills may be running concurrently, we need to
1394
+ # make sure that prefills which are scheduled to finish are listed
1395
+ # before those that won't. This is so that on the next scheduling
1396
+ # iteration when they have transitioned to the decode stage, they are
1397
+ # properly prioritized over sequences that are still in the prefill
1398
+ # stage.
1399
+ self.running.extend(
1400
+ self._order_finishing_prefills_first(
1401
+ running_scheduled.prefill_seq_groups))
1402
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1403
+
1404
+ # Update swapped requests.
1405
+ self.swapped.extend(running_scheduled.swapped_out)
1406
+ # Put prefills first due to Attention backend ordering assumption.
1407
+ scheduled_seq_groups = (prefills.seq_groups +
1408
+ running_scheduled.prefill_seq_groups +
1409
+ swapped_in.prefill_seq_groups +
1410
+ running_scheduled.decode_seq_groups +
1411
+ swapped_in.decode_seq_groups)
1412
+ num_prefill_groups = (len(prefills.seq_groups) +
1413
+ len(swapped_in.prefill_seq_groups) +
1414
+ len(running_scheduled.prefill_seq_groups))
1415
+ # If all groups are prompts, then we set num_lookahead_slots to 0;
1416
+ # this allows us to go through the `no_spec` path in
1417
+ # `spec_decode_worker.py`
1418
+ all_prefills = len(scheduled_seq_groups) == num_prefill_groups
1419
+ num_lookahead_slots = (0 if
1420
+ (all_prefills
1421
+ and not self.scheduler_config.is_multi_step)
1422
+ else running_scheduled.num_lookahead_slots)
1423
+ return SchedulerOutputs(
1424
+ scheduled_seq_groups=scheduled_seq_groups,
1425
+ num_prefill_groups=num_prefill_groups,
1426
+ num_batched_tokens=budget.num_batched_tokens +
1427
+ budget.num_cached_tokens,
1428
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1429
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1430
+ blocks_to_copy=running_scheduled.blocks_to_copy +
1431
+ swapped_in.blocks_to_copy,
1432
+ ignored_seq_groups=prefills.ignored_seq_groups +
1433
+ swapped_in.infeasible_seq_groups,
1434
+ num_lookahead_slots=num_lookahead_slots,
1435
+ running_queue_size=len(self.running),
1436
+ preempted=(len(running_scheduled.preempted) +
1437
+ len(running_scheduled.swapped_out)),
1438
+ )
1439
+
1440
+ def _order_finishing_prefills_first(
1441
+ self, scheduled_prefill_seqs: List[ScheduledSequenceGroup]
1442
+ ) -> List[SequenceGroup]:
1443
+ """Returns a list of prefilling SequenceGroups where sequences that are
1444
+ scheduled to finish prefilling are listed first"""
1445
+ finishing = [
1446
+ s.seq_group for s in scheduled_prefill_seqs
1447
+ if s.seq_group.get_num_uncomputed_tokens() == s.token_chunk_size
1448
+ ]
1449
+ not_finishing = [
1450
+ s.seq_group for s in scheduled_prefill_seqs
1451
+ if s.seq_group.get_num_uncomputed_tokens() != s.token_chunk_size
1452
+ ]
1453
+ return finishing + not_finishing
1454
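# [Editor's note] Illustrative partition equivalent to the method above,
# using plain (uncomputed_tokens, chunk_size) tuples instead of real groups:
#
#     >>> chunks = [(512, 256), (128, 128), (300, 256), (64, 64)]
#     >>> finishing = [c for c in chunks if c[0] == c[1]]
#     >>> not_finishing = [c for c in chunks if c[0] != c[1]]
#     >>> finishing + not_finishing
#     [(128, 128), (64, 64), (512, 256), (300, 256)]
#
# Groups whose scheduled chunk covers all remaining prompt tokens come first.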
+
1455
+ def _schedule(self) -> SchedulerOutputs:
1456
+ """Schedule queued requests."""
1457
+ if self.scheduler_config.chunked_prefill_enabled:
1458
+ return self._schedule_chunked_prefill()
1459
+ else:
1460
+ return self._schedule_default()
1461
+
1462
+ def _can_append_slots(self, seq_group: SequenceGroup,
1463
+ enable_chunking: bool) -> bool:
1464
+ """Determine whether or not we have enough space in the KV cache to
1465
+ continue generation of the sequence group.
1466
+ """
1467
+ # It is True only in the testing case, to trigger artificial preemption.
1468
+ if (self.enable_artificial_preemption
1469
+ and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
1470
+ and self.artificial_preempt_cnt > 0):
1471
+ self.artificial_preempt_cnt -= 1
1472
+ return False
1473
+
1474
+ is_prefill = seq_group.is_prefill()
1475
+ num_lookahead_slots = self._get_num_lookahead_slots(
1476
+ is_prefill, enable_chunking)
1477
+
1478
+ if is_prefill and num_lookahead_slots > 0:
1479
+ # Appending prefill slots only happens when multi-step and
1480
+ # chunked-prefill are enabled together.
1481
+ assert self.scheduler_config.is_multi_step and enable_chunking
1482
+
1483
+ return self.block_manager.can_append_slots(
1484
+ seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)
1485
+
1486
+ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
1487
+ # async_output_proc is allowed only when we have a single sequence
1488
+ # in the sequence group
1489
+ no_single_seq = seq_group.sampling_params is None or (
1490
+ seq_group.sampling_params.n == 1)
1491
+ return no_single_seq
1492
+
1493
+ def schedule(
1494
+ self
1495
+ ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
1496
+ # Schedule sequence groups.
1497
+ # This function call changes the internal states of the scheduler
1498
+ # such as self.running, self.swapped, and self.waiting.
1499
+ scheduler_start_time = time.perf_counter()
1500
+
1501
+ scheduler_outputs: SchedulerOutputs = self._schedule()
1502
+ now = time.time()
1503
+
1504
+ if not self.cache_config.enable_prefix_caching:
1505
+ common_computed_block_nums = []
1506
+
1507
+ allow_async_output_proc: bool = self.use_async_output_proc
1508
+
1509
+ # Create input data structures.
1510
+ seq_group_metadata_list: List[SequenceGroupMetadata] = []
1511
+ for i, scheduled_seq_group in enumerate(
1512
+ scheduler_outputs.scheduled_seq_groups):
1513
+ seq_group = scheduled_seq_group.seq_group
1514
+ token_chunk_size = scheduled_seq_group.token_chunk_size
1515
+ seq_group.maybe_set_first_scheduled_time(now)
1516
+
1517
+ seq_group_metadata = self._seq_group_metadata_cache[
1518
+ self.cache_id].get_object()
1519
+ seq_group_metadata.seq_data.clear()
1520
+ seq_group_metadata.block_tables.clear()
1521
+
1522
+ # seq_id -> SequenceData
1523
+ seq_data: Dict[int, SequenceData] = {}
1524
+ # seq_id -> physical block numbers
1525
+ block_tables: Dict[int, List[int]] = {}
1526
+
1527
+ if seq_group.is_encoder_decoder():
1528
+ # Encoder associated with SequenceGroup
1529
+ encoder_seq = seq_group.get_encoder_seq()
1530
+ assert encoder_seq is not None
1531
+ encoder_seq_data = encoder_seq.data
1532
+ # Block table for cross-attention
1533
+ # Also managed at SequenceGroup level
1534
+ cross_block_table = self.block_manager.get_cross_block_table(
1535
+ seq_group)
1536
+ else:
1537
+ encoder_seq_data = None
1538
+ cross_block_table = None
1539
+
1540
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1541
+ seq_id = seq.seq_id
1542
+ seq_data[seq_id] = seq.data
1543
+ block_tables[seq_id] = self.block_manager.get_block_table(seq)
1544
+ self.block_manager.access_all_blocks_in_seq(seq, now)
1545
+
1546
+ if self.cache_config.enable_prefix_caching:
1547
+ common_computed_block_nums = (
1548
+ self.block_manager.get_common_computed_block_ids(
1549
+ seq_group.get_seqs(status=SequenceStatus.RUNNING)))
1550
+
1551
+ do_sample = True
1552
+ is_prompt = seq_group.is_prefill()
1553
+ # We should send the metadata to workers when the first prefill
1554
+ # is sent. Subsequent requests could be chunked prefill or decode.
1555
+ is_first_prefill = False
1556
+ if is_prompt:
1557
+ seqs = seq_group.get_seqs()
1558
+ # Prefill has only 1 sequence.
1559
+ assert len(seqs) == 1
1560
+ num_computed_tokens = seqs[0].data.get_num_computed_tokens()
1561
+ is_first_prefill = num_computed_tokens == 0
1562
+ # If not all prompt tokens will have been computed after this
1563
+ # iteration, the prefill is chunked and we don't need sampling.
1564
+ # NOTE: We use get_len instead of get_prompt_len because when
1565
+ # a sequence is preempted, prefill includes previous generated
1566
+ # output tokens.
1567
+ if (token_chunk_size + num_computed_tokens
1568
+ < seqs[0].data.get_len()):
1569
+ do_sample = False
1570
+
1571
+ # It assumes the scheduled_seq_groups is ordered by
1572
+ # prefill < decoding.
1573
+ if is_first_prefill or not self.scheduler_config.send_delta_data:
1574
+ seq_group_metadata = SequenceGroupMetadata(
1575
+ request_id=seq_group.request_id,
1576
+ is_prompt=is_prompt,
1577
+ seq_data=seq_data,
1578
+ sampling_params=seq_group.sampling_params,
1579
+ block_tables=block_tables,
1580
+ do_sample=do_sample,
1581
+ pooling_params=seq_group.pooling_params,
1582
+ token_chunk_size=token_chunk_size,
1583
+ lora_request=seq_group.lora_request,
1584
+ computed_block_nums=common_computed_block_nums,
1585
+ encoder_seq_data=encoder_seq_data,
1586
+ cross_block_table=cross_block_table,
1587
+ state=seq_group.state,
1588
+ token_type_ids=seq_group.token_type_ids,
1589
+ # `multi_modal_data` will only be present for the 1st comm
1590
+ # between engine and worker.
1591
+ # the subsequent comms can still use delta, but
1592
+ # `multi_modal_data` will be None.
1593
+ multi_modal_data=(seq_group.multi_modal_data
1594
+ if scheduler_outputs.num_prefill_groups
1595
+ > 0 else None),
1596
+ multi_modal_placeholders=(
1597
+ seq_group.multi_modal_placeholders
1598
+ if scheduler_outputs.num_prefill_groups > 0 else None),
1599
+ prompt_adapter_request=seq_group.prompt_adapter_request,
1600
+ )
1601
+ else:
1602
+ # When SPMD mode is enabled, we only send delta data except for
1603
+ # the first request to reduce serialization cost.
1604
+ seq_data_delta = {}
1605
+ for id, data in seq_data.items():
1606
+ seq_data_delta[id] = data.get_delta_and_reset()
1607
+ seq_group_metadata = SequenceGroupMetadataDelta(
1608
+ seq_data_delta,
1609
+ seq_group.request_id,
1610
+ block_tables,
1611
+ is_prompt,
1612
+ do_sample=do_sample,
1613
+ token_chunk_size=token_chunk_size,
1614
+ computed_block_nums=common_computed_block_nums,
1615
+ )
1616
+ seq_group_metadata_list.append(seq_group_metadata)
1617
+
1618
+ if allow_async_output_proc:
1619
+ allow_async_output_proc = self._allow_async_output_proc(
1620
+ seq_group)
1621
+
1622
+ # Now that the batch has been created, we can assume all blocks in the
1623
+ # batch will have been computed before the next scheduling invocation.
1624
+ # This is because the engine assumes that a failure in model execution
1625
+ # will crash the vLLM instance / will not retry.
1626
+ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
1627
+ self.block_manager.mark_blocks_as_computed(
1628
+ scheduled_seq_group.seq_group,
1629
+ scheduled_seq_group.token_chunk_size)
1630
+
1631
+ self._seq_group_metadata_cache[self.next_cache_id].reset()
1632
+
1633
+ scheduler_time = time.perf_counter() - scheduler_start_time
1634
+ # Add this to scheduler time to all the sequences that are currently
1635
+ # running. This will help estimate if the scheduler is a significant
1636
+ # component in the e2e latency.
1637
+ for seq_group in self.running:
1638
+ if seq_group is not None and seq_group.metrics is not None:
1639
+ if seq_group.metrics.scheduler_time is not None:
1640
+ seq_group.metrics.scheduler_time += scheduler_time
1641
+ else:
1642
+ seq_group.metrics.scheduler_time = scheduler_time
1643
+
1644
+ # Move to next cache (if exists)
1645
+ self.cache_id = self.next_cache_id
1646
+
1647
+ # Return results
1648
+ return (seq_group_metadata_list, scheduler_outputs,
1649
+ allow_async_output_proc)
1650
+
1651
+ def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
1652
+ self.block_manager.fork(parent_seq, child_seq)
1653
+
1654
+ def free_seq(self, seq: Sequence) -> None:
1655
+ """Free a sequence from a block table."""
1656
+ self.block_manager.free(seq)
1657
+
1658
+ def _free_finished_seqs(self, seq_group: SequenceGroup) -> None:
1659
+ """Free finished seqs in a sequence group."""
1660
+ for seq in seq_group.get_seqs():
1661
+ if seq.is_finished():
1662
+ self.free_seq(seq)
1663
+
1664
+ def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None:
1665
+ if seq_group.is_finished():
1666
+ # Free cross-attention block table, if it exists
1667
+ self._free_seq_group_cross_attn_blocks(seq_group)
1668
+
1669
+ # Add the finished requests to the finished requests list.
1670
+ # This list will be used to update the Mamba cache in the
1671
+ # next step.
1672
+ self._finished_requests_ids.append(seq_group.request_id)
1673
+
1674
+ # Free finished seqs
1675
+ self._free_finished_seqs(seq_group)
1676
+
1677
+ def free_finished_seq_groups(self) -> None:
1678
+ remaining: Deque[SequenceGroup] = deque()
1679
+ for seq_group in self.running:
1680
+ self._free_finished_seq_group(seq_group)
1681
+ if not seq_group.is_finished():
1682
+ remaining.append(seq_group)
1683
+
1684
+ self.running = remaining
1685
+
1686
+ # Handle async stopped sequence groups
1687
+ # (ones that reached max model len)
1688
+ if self._async_stopped:
1689
+ for seq_group in self._async_stopped:
1690
+ self._free_seq_group_cross_attn_blocks(seq_group)
1691
+ self._finished_requests_ids.append(seq_group.request_id)
1692
+
1693
+ # Free finished seqs
1694
+ self._free_finished_seqs(seq_group)
1695
+
1696
+ self._async_stopped.clear()
1697
+
1698
+ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
1699
+ self.block_manager.allocate(seq_group)
1700
+ for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
1701
+ seq.status = SequenceStatus.RUNNING
1702
+
1703
+ def _append_slots(
1704
+ self,
1705
+ seq_group: SequenceGroup,
1706
+ blocks_to_copy: List[Tuple[int, int]],
1707
+ enable_chunking: bool = False,
1708
+ ) -> None:
1709
+ """Appends new slots to the sequences in the given sequence group.
1710
+
1711
+ Args:
1712
+ seq_group (SequenceGroup): The sequence group containing the
1713
+ sequences to append slots to.
1714
+ blocks_to_copy (List[Tuple[int, int]]): A list of tuples of two
1715
+ ints, the first int is the source block index, and the second
1716
+ int is the destination block index. This list is updated with
1717
+ the new source and destination block indices for the appended
1718
+ slots.
1719
+ enable_chunking (bool): True if chunked prefill is enabled.
1720
+ """
1721
+ is_prefill: bool = seq_group.is_prefill()
1722
+ num_lookahead_slots: int = self._get_num_lookahead_slots(
1723
+ is_prefill, enable_chunking)
1724
+
1725
+ seq_group.init_multi_step_from_lookahead_slots(
1726
+ num_lookahead_slots,
1727
+ num_scheduler_steps=self.scheduler_config.num_scheduler_steps,
1728
+ is_multi_step=self.scheduler_config.is_multi_step,
1729
+ enable_chunking=enable_chunking,
1730
+ )
1731
+
1732
+ seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING
1733
+ if self.scheduler_config.is_multi_step and enable_chunking:
1734
+ # In multi-step chunked-prefill any sequence type can have
1735
+ # slots appended.
1736
+ seq_status = None
1737
+
1738
+ for seq in seq_group.get_seqs(status=seq_status):
1739
+ cows = self.block_manager.append_slots(seq, num_lookahead_slots)
1740
+ if len(cows) > 0:
1741
+ blocks_to_copy.extend(cows)
1742
+
1743
+ def _preempt(self, seq_group: SequenceGroup,
1744
+ blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
1745
+ # If preemption mode is not specified, we determine the mode as follows:
1746
+ # We use recomputation by default since it incurs lower overhead than
1747
+ # swapping. However, when the sequence group has multiple sequences
1748
+ # (e.g., beam search), recomputation is not currently supported. In
1749
+ # such a case, we use swapping instead.
1750
+ # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
1751
+ # As swapped sequences are prioritized over waiting sequences,
1752
+ # sequence groups with multiple sequences are implicitly prioritized
1753
+ # over sequence groups with a single sequence.
1754
+ # TODO(woosuk): Support recomputation for sequence groups with multiple
1755
+ # sequences. This may require a more sophisticated CUDA kernel.
1756
+ if self.user_specified_preemption_mode is None:
1757
+ if seq_group.get_max_num_running_seqs() == 1:
1758
+ preemption_mode = PreemptionMode.RECOMPUTE
1759
+ else:
1760
+ preemption_mode = PreemptionMode.SWAP
1761
+
1762
+ elif self.user_specified_preemption_mode == "swap":
1763
+ preemption_mode = PreemptionMode.SWAP
1764
+ else:
1765
+ preemption_mode = PreemptionMode.RECOMPUTE
1766
+
1767
+ if self.num_cumulative_preemption % 50 == 0:
1768
+ logger.warning(
1769
+ "Sequence group %s is preempted by %s mode because there is "
1770
+ "not enough KV cache space. This can affect the end-to-end "
1771
+ "performance. Increase gpu_memory_utilization or "
1772
+ "tensor_parallel_size to provide more KV cache memory. "
1773
+ "total_num_cumulative_preemption=%d",
1774
+ seq_group.request_id,
1775
+ preemption_mode,
1776
+ self.num_cumulative_preemption + 1,
1777
+ )
1778
+ self.num_cumulative_preemption += 1
1779
+
1780
+ if preemption_mode == PreemptionMode.RECOMPUTE:
1781
+ self._preempt_by_recompute(seq_group)
1782
+ elif preemption_mode == PreemptionMode.SWAP:
1783
+ self._preempt_by_swap(seq_group, blocks_to_swap_out)
1784
+ else:
1785
+ raise AssertionError("Invalid preemption mode.")
1786
+ return preemption_mode
1787
+
1788
+ def _preempt_by_recompute(
1789
+ self,
1790
+ seq_group: SequenceGroup,
1791
+ ) -> None:
1792
+ seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
1793
+ assert len(seqs) == 1
1794
+ for seq in seqs:
1795
+ seq.status = SequenceStatus.WAITING
1796
+ self.free_seq(seq)
1797
+ seq.reset_state_for_recompute()
1798
+ self._free_seq_group_cross_attn_blocks(seq_group)
1799
+
1800
+ def _preempt_by_swap(
1801
+ self,
1802
+ seq_group: SequenceGroup,
1803
+ blocks_to_swap_out: List[Tuple[int, int]],
1804
+ ) -> None:
1805
+ self._swap_out(seq_group, blocks_to_swap_out)
1806
+
1807
+ def _swap_in(
1808
+ self,
1809
+ seq_group: SequenceGroup,
1810
+ blocks_to_swap_in: List[Tuple[int, int]],
1811
+ ) -> None:
1812
+ mapping = self.block_manager.swap_in(seq_group)
1813
+ blocks_to_swap_in.extend(mapping)
1814
+ for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
1815
+ seq.status = SequenceStatus.RUNNING
1816
+
1817
+ def _swap_out(
1818
+ self,
1819
+ seq_group: SequenceGroup,
1820
+ blocks_to_swap_out: List[Tuple[int, int]],
1821
+ ) -> None:
1822
+ if not self.block_manager.can_swap_out(seq_group):
1823
+ # FIXME(woosuk): Abort the sequence group instead of aborting the
1824
+ # entire engine.
1825
+ raise RuntimeError(
1826
+ "Aborted due to the lack of CPU swap space. Please increase "
1827
+ "the swap space to avoid this error.")
1828
+ mapping = self.block_manager.swap_out(seq_group)
1829
+ blocks_to_swap_out.extend(mapping)
1830
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1831
+ seq.status = SequenceStatus.SWAPPED
1832
+
1833
+ def _passed_delay(self, now: float) -> bool:
1834
+ if self.prev_prompt:
1835
+ self.last_prompt_latency = now - self.prev_time
1836
+ self.prev_time, self.prev_prompt = now, False
1837
+ # Delay scheduling prompts to let waiting queue fill up
1838
+ if self.scheduler_config.delay_factor > 0 and self.waiting:
1839
+ earliest_arrival_time = min(
1840
+ [e.metrics.arrival_time for e in self.waiting])
1841
+ passed_delay = ((now - earliest_arrival_time)
1842
+ > (self.scheduler_config.delay_factor *
1843
+ self.last_prompt_latency) or not self.running)
1844
+ else:
1845
+ passed_delay = True
1846
+ return passed_delay
1847
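# [Editor's note] A worked example of the delay heuristic above, assuming
# delay_factor=0.5 and a previous prompt latency of 2.0 s:
#
#     >>> delay_factor, last_prompt_latency = 0.5, 2.0
#     >>> now, earliest_arrival = 105.0, 104.2
#     >>> round(now - earliest_arrival, 3) > delay_factor * last_prompt_latency
#     False
#
# The oldest waiting prompt has been queued for only 0.8 s, below the 1.0 s
# threshold, so new prompts are held back unless nothing is running.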
+
1848
+ def _get_num_lookahead_slots(self, is_prefill: bool,
1849
+ enable_chunking: bool) -> int:
1850
+ """The number of slots to allocate per sequence per step, beyond known
1851
+ token ids. Speculative decoding uses these slots to store KV activations
1852
+ of tokens which may or may not be accepted.
1853
+
1854
+ Speculative decoding does not yet support prefill, so we do not perform
1855
+ lookahead allocation for prefill.
1856
+
1857
+ When chunking is enabled with multi-step, we allocate lookahead slots
1858
+ for the prefills for when the prefills turn into decodes in the first
1859
+ step.
1860
+ """
1861
+ if is_prefill:
1862
+ if self.scheduler_config.is_multi_step and enable_chunking:
1863
+ # num_lookahead_slots was introduced in the context of decodes,
1864
+ # in Speculative Decoding.
1865
+ # When the num_scheduler_steps is 8, say, then the
1866
+ # num_lookahead_slots is 7. Meaning, we are doing a 1-step of
1867
+ # decode anyway and we wish to do 7 more.
1868
+ #
1869
+ # "lookaheads" for prefills, is introduced in support for
1870
+ # Chunked-Prefill in Multi-Step.
1871
+ return self.scheduler_config.num_lookahead_slots + 1
1872
+ else:
1873
+ return 0
1874
+
1875
+ return self.scheduler_config.num_lookahead_slots
1876
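# [Editor's note] Example values for the rule above, assuming
# num_lookahead_slots=7 (e.g. num_scheduler_steps=8 in multi-step mode):
#
#     >>> num_lookahead_slots = 7
#     >>> def lookahead(is_prefill, multi_step_chunked):
#     ...     if is_prefill:
#     ...         return num_lookahead_slots + 1 if multi_step_chunked else 0
#     ...     return num_lookahead_slots
#     >>> lookahead(True, False), lookahead(True, True), lookahead(False, False)
#     (0, 8, 7)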
+
1877
+ def _get_num_new_uncached_and_cached_tokens(
1878
+ self,
1879
+ seq_group: SequenceGroup,
1880
+ status: SequenceStatus,
1881
+ enable_chunking: bool,
1882
+ budget: SchedulingBudget,
1883
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
1884
+ ) -> Tuple[int, int]:
1885
+ """
1886
+ Returns the number of new uncached and cached tokens to schedule for a
1887
+ given sequence group that's in a given `status`.
1888
+
1889
+ The API could chunk the number of tokens to compute based on `budget`
1890
+ if `enable_chunking` is True. If a sequence group has multiple
1891
+ sequences (e.g., running beam search), it means it is in decoding
1892
+ phase, so chunking doesn't happen.
1893
+
1894
+ Returns (0, 0) if the new token cannot be computed due to token budget.
1895
+
1896
+ The cached tokens' blocks are already computed, and the attention
1897
+ backend will reuse the cached blocks rather than recomputing them. So
1898
+ the scheduler could schedule these cached tokens "for free".
1899
+
1900
+ Args:
1901
+ seq_group: The sequence group to get the number of new tokens to
1902
+ schedule.
1903
+ status: The status of the sequences to get the number of new tokens
1904
+ to schedule.
1905
+ enable_chunking: Whether to chunk the number of tokens to compute.
1906
+ budget: The budget to chunk the number of tokens to compute.
1907
+ partial_prefill_metadata: information about the partial prefills
1908
+ that are currently running
1909
+
1910
+
1911
+ Returns:
1912
+ A tuple of two ints. The first int is the number of new uncached
1913
+ tokens to schedule. The second int is the number of cached tokens.
1914
+ If no more new tokens can be scheduled, returns (0, 0).
1915
+ """
1916
+ num_cached_new_tokens = 0
1917
+ num_uncached_new_tokens = 0
1918
+
1919
+ seqs = seq_group.get_seqs(status=status)
1920
+ # Compute the number of new uncached and cached tokens for
1921
+ # each sequence.
1922
+ for seq in seqs:
1923
+ if not seq.is_prefill():
1924
+ # Decode sequences should always just have 1 uncached token
1925
+ # TODO(rickyx): Actually is this still correct for multi-step?
1926
+ num_uncached_new_tokens += 1
1927
+ continue
1928
+
1929
+ num_computed_tokens_seq = seq.get_num_computed_tokens()
1930
+ all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq
1931
+ if not self.cache_config.enable_prefix_caching:
1932
+ # If prefix caching is not enabled, all new tokens are uncached.
1933
+ num_uncached_new_tokens += all_num_new_tokens_seq
1934
+ continue
1935
+
1936
+ # NOTE: the cached token might currently be in a block that's in an
1937
+ # evictor, meaning that it's not yet allocated. However, we don't
1938
+ # exclude such tokens from the cache count because they are
1939
+ # guaranteed to be allocated later if the sequence can be allocated.
1940
+ num_cached_tokens_seq = self.block_manager.get_num_cached_tokens(
1941
+ seq)
1942
+
1943
+ # Sanity check.
1944
+ if num_cached_tokens_seq < num_computed_tokens_seq:
1945
+ # This should only happen with chunked prefill, and
1946
+ # the seq is still in prefill. The `num_cached_tokens_seq`
1947
+ # is the value we calculated on scheduling the first prefill.
1948
+ # For subsequent continuous prefill steps, we cached the
1949
+ # number of cached tokens for the sequence, so the cached token
1950
+ # count could be less than the number of computed tokens.
1951
+ # See comments on `ComputedBlocksTracker` for more details.
1952
+ assert (
1953
+ seq.is_prefill() and seq.status == SequenceStatus.RUNNING
1954
+ and self.scheduler_config.chunked_prefill_enabled
1955
+ ), ("Number of cached tokens should not be less than the "
1956
+ "number of computed tokens for a sequence that's still "
1957
+ f"in prefill. But there are {num_cached_tokens_seq} cached "
1958
+ f"tokens and {num_computed_tokens_seq} computed tokens "
1959
+ f"for sequence {seq.seq_id}.")
1960
+
1961
+ num_cached_new_tokens_seq = max(
1962
+ 0, num_cached_tokens_seq - num_computed_tokens_seq)
1963
+ num_uncached_new_tokens_seq = (all_num_new_tokens_seq -
1964
+ num_cached_new_tokens_seq)
1965
+
1966
+ num_uncached_new_tokens += num_uncached_new_tokens_seq
1967
+ num_cached_new_tokens += num_cached_new_tokens_seq
1968
+
1969
+ if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0:
1970
+ # For a fully cache-hit sequence, we actually need to recompute the
1971
+ # last token. So we need at least 1 uncached token to schedule.
1972
+ # See ModelRunner._compute_for_prefix_cache_hit for more details.
1973
+ num_uncached_new_tokens = 1
1974
+ num_cached_new_tokens -= 1
1975
+
1976
+ if enable_chunking and len(seqs) == 1:
1977
+ # Chunk if a running request cannot fit in the given budget.
1978
+ # If number of seq > 1, it means it is doing beam search
1979
+ # in a decode phase. Do not chunk.
1980
+ num_uncached_new_tokens = self._chunk_new_tokens_to_schedule(
1981
+ self.scheduler_config,
1982
+ self.cache_config,
1983
+ budget,
1984
+ self._get_prompt_limit(seq_group),
1985
+ num_uncached_new_tokens,
1986
+ self.partial_prefill_budget_lookup_list,
1987
+ partial_prefill_metadata,
1988
+ )
1989
+
1990
+ return num_uncached_new_tokens, num_cached_new_tokens
1991
+
1992
+ @staticmethod
1993
+ def _chunk_new_tokens_to_schedule(
1994
+ scheduler_config: SchedulerConfig,
1995
+ cache_config: CacheConfig,
1996
+ budget: SchedulingBudget,
1997
+ prompt_limit: int,
1998
+ num_new_tokens: int,
1999
+ partial_prefill_budget_lookup_list: List[int],
2000
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
2001
+ ) -> int:
2002
+ """
2003
+ Chunks the number of new tokens to schedule based on the budget when
2004
+ chunked prefill is enabled.
2005
+
2006
+ Args:
2007
+ scheduler_config: The scheduler config.
2008
+ cache_config: The cache config.
2009
+ budget: The budget to chunk the number of tokens to compute.
2010
+ prompt_limit: The maximum number of tokens allowed in a prompt.
2011
+ num_new_tokens: The number of new tokens to schedule.
2012
+
2013
+ Returns:
2014
+ The number of new tokens to schedule after chunking.
2015
+ """
2016
+ remaining_token_budget = budget.remaining_token_budget()
2017
+ if scheduler_config.is_multi_step:
2018
+ # The current multi-step + chunked prefill capability does
2019
+ # not actually support chunking prompts.
2020
+ #
2021
+ # Therefore, `num_new_tokens` is computed in the same fashion
2022
+ # for both multi-step+chunked-prefill &
2023
+ # multi-step+chunked-prefill+APC
2024
+ #
2025
+ # Prompts with more tokens than the current remaining budget
2026
+ # are postponed to future scheduler steps
2027
+ if num_new_tokens > prompt_limit:
2028
+ # If the seq_group is in prompt-stage, pass the
2029
+ # num_new_tokens as-is so the caller can ignore
2030
+ # the sequence.
2031
+ return num_new_tokens
2032
+
2033
+ return 0 if num_new_tokens > \
2034
+ remaining_token_budget else num_new_tokens
2035
+
2036
+ # Get the number of tokens to allocate to this prefill slot
2037
+ prefill_slot_budget = (
2038
+ remaining_token_budget if partial_prefill_metadata is None else
2039
+ partial_prefill_budget_lookup_list[
2040
+ partial_prefill_metadata.schedulable_prefills])
2041
+
2042
+ if cache_config.enable_prefix_caching:
2043
+ # When prefix caching is enabled and we're partially prefilling
2044
+ # a sequence, we always allocate a number of new tokens that is
2045
+ # divisible by the block size to avoid partial block matching.
2046
+ block_size = cache_config.block_size
2047
+ # Don't exceed either the total budget or slot budget.
2048
+ # Take min of those and get the next lowest multiple of the
2049
+ # block size:
2050
+ remaining_token_budget = (
2051
+ min(remaining_token_budget, prefill_slot_budget) //
2052
+ block_size) * block_size
2053
+ # NB: In the case where num_new_tokens < budget, we are
2054
+ # finishing prefill for this sequence, so we do not need to
2055
+ # allocate a full block.
2056
+
2057
+ num_new_tokens = min(num_new_tokens, remaining_token_budget,
2058
+ prefill_slot_budget)
2059
+
2060
+ return num_new_tokens
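# [Editor's note] A worked example of the block-aligned chunking above when
# prefix caching is enabled, with assumed values block_size=16,
# remaining_token_budget=760 and prefill_slot_budget=600:
#
#     >>> remaining, slot, block_size = 760, 600, 16
#     >>> aligned = (min(remaining, slot) // block_size) * block_size
#     >>> aligned
#     592
#     >>> min(1000, aligned, slot)    # a 1000-token prompt gets 592 this step
#     592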