vllm-cpu-avx512bf16 0.9.0.post2-cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
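For reference, a wheel is an ordinary ZIP archive, so a file listing like the one below can be reproduced locally with Python's standard zipfile module. This is a minimal sketch only; the local filename shown is an assumption based on the package name and version above, and the per-file line counts in the listing come from the registry diff, not from this script.

    # Sketch: list the member paths of a downloaded wheel (a wheel is a ZIP archive).
    # The filename below is an assumption; adjust it to the wheel you actually downloaded.
    import zipfile

    wheel_path = "vllm_cpu_avx512bf16-0.9.0.post2-cp310-cp310-manylinux_2_17_x86_64.whl"
    with zipfile.ZipFile(wheel_path) as wheel:
        for name in wheel.namelist():
            # Each entry corresponds to one file packaged in the wheel, e.g. vllm/__init__.py
            print(name)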
Files changed (1175)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1742 -0
  4. vllm/_ipex_ops.py +243 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +15 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +44 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +33 -0
  16. vllm/assets/video.py +114 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +305 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1494 -0
  23. vllm/attention/backends/flash_attn.py +999 -0
  24. vllm/attention/backends/flashinfer.py +1100 -0
  25. vllm/attention/backends/flashmla.py +242 -0
  26. vllm/attention/backends/hpu_attn.py +309 -0
  27. vllm/attention/backends/ipex_attn.py +394 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1381 -0
  30. vllm/attention/backends/pallas.py +347 -0
  31. vllm/attention/backends/placeholder_attn.py +399 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +970 -0
  34. vllm/attention/backends/torch_sdpa.py +691 -0
  35. vllm/attention/backends/triton_mla.py +113 -0
  36. vllm/attention/backends/utils.py +609 -0
  37. vllm/attention/backends/xformers.py +798 -0
  38. vllm/attention/layer.py +452 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +245 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +367 -0
  45. vllm/attention/ops/flashmla.py +115 -0
  46. vllm/attention/ops/hpu_paged_attn.py +87 -0
  47. vllm/attention/ops/ipex_attn.py +194 -0
  48. vllm/attention/ops/merge_attn_states.py +42 -0
  49. vllm/attention/ops/nki_flash_attn.py +905 -0
  50. vllm/attention/ops/paged_attn.py +255 -0
  51. vllm/attention/ops/prefix_prefill.py +901 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +99 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  54. vllm/attention/ops/triton_decode_attention.py +673 -0
  55. vllm/attention/ops/triton_flash_attention.py +1374 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  57. vllm/attention/ops/triton_unified_attention.py +337 -0
  58. vllm/attention/selector.py +186 -0
  59. vllm/attention/utils/fa_utils.py +54 -0
  60. vllm/beam_search.py +82 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +921 -0
  63. vllm/benchmarks/endpoint_request_func.py +160 -0
  64. vllm/benchmarks/latency.py +184 -0
  65. vllm/benchmarks/serve.py +925 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +69 -0
  68. vllm/collect_env.py +818 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +88 -0
  71. vllm/compilation/backends.py +560 -0
  72. vllm/compilation/base_piecewise_backend.py +71 -0
  73. vllm/compilation/collective_fusion.py +126 -0
  74. vllm/compilation/compiler_interface.py +533 -0
  75. vllm/compilation/counter.py +33 -0
  76. vllm/compilation/cuda_piecewise_backend.py +213 -0
  77. vllm/compilation/decorators.py +249 -0
  78. vllm/compilation/fix_functionalization.py +190 -0
  79. vllm/compilation/fusion.py +617 -0
  80. vllm/compilation/fx_utils.py +61 -0
  81. vllm/compilation/inductor_pass.py +114 -0
  82. vllm/compilation/monitor.py +38 -0
  83. vllm/compilation/multi_output_match.py +108 -0
  84. vllm/compilation/noop_elimination.py +136 -0
  85. vllm/compilation/pass_manager.py +77 -0
  86. vllm/compilation/sequence_parallelism.py +267 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  88. vllm/compilation/vllm_inductor_pass.py +66 -0
  89. vllm/compilation/wrapper.py +129 -0
  90. vllm/config.py +4600 -0
  91. vllm/connections.py +173 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +398 -0
  95. vllm/core/block/common.py +370 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  97. vllm/core/block/interfaces.py +318 -0
  98. vllm/core/block/naive_block.py +465 -0
  99. vllm/core/block/prefix_caching_block.py +1134 -0
  100. vllm/core/block/utils.py +27 -0
  101. vllm/core/block_manager.py +520 -0
  102. vllm/core/evictor.py +156 -0
  103. vllm/core/interfaces.py +134 -0
  104. vllm/core/placeholder_block_space_manager.py +99 -0
  105. vllm/core/scheduler.py +2092 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +280 -0
  108. vllm/distributed/__init__.py +5 -0
  109. vllm/distributed/communication_op.py +40 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +126 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +144 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +167 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +303 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +258 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  120. vllm/distributed/device_communicators/pynccl.py +217 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +541 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  125. vllm/distributed/kv_events.py +296 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +11 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +126 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +202 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +91 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +5 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +259 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +189 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +851 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  152. vllm/distributed/parallel_state.py +1294 -0
  153. vllm/distributed/utils.py +520 -0
  154. vllm/engine/__init__.py +0 -0
  155. vllm/engine/arg_utils.py +1649 -0
  156. vllm/engine/async_llm_engine.py +1274 -0
  157. vllm/engine/async_timeout.py +191 -0
  158. vllm/engine/llm_engine.py +2153 -0
  159. vllm/engine/metrics.py +717 -0
  160. vllm/engine/metrics_types.py +96 -0
  161. vllm/engine/multiprocessing/__init__.py +188 -0
  162. vllm/engine/multiprocessing/client.py +755 -0
  163. vllm/engine/multiprocessing/engine.py +459 -0
  164. vllm/engine/output_processor/__init__.py +0 -0
  165. vllm/engine/output_processor/interfaces.py +74 -0
  166. vllm/engine/output_processor/multi_step.py +215 -0
  167. vllm/engine/output_processor/single_step.py +144 -0
  168. vllm/engine/output_processor/stop_checker.py +130 -0
  169. vllm/engine/output_processor/util.py +27 -0
  170. vllm/engine/protocol.py +310 -0
  171. vllm/entrypoints/__init__.py +0 -0
  172. vllm/entrypoints/api_server.py +177 -0
  173. vllm/entrypoints/chat_utils.py +1298 -0
  174. vllm/entrypoints/cli/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/base.py +38 -0
  177. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  178. vllm/entrypoints/cli/benchmark/main.py +53 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  180. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  181. vllm/entrypoints/cli/collect_env.py +34 -0
  182. vllm/entrypoints/cli/main.py +62 -0
  183. vllm/entrypoints/cli/openai.py +204 -0
  184. vllm/entrypoints/cli/serve.py +141 -0
  185. vllm/entrypoints/cli/types.py +24 -0
  186. vllm/entrypoints/launcher.py +146 -0
  187. vllm/entrypoints/llm.py +1503 -0
  188. vllm/entrypoints/logger.py +49 -0
  189. vllm/entrypoints/openai/__init__.py +0 -0
  190. vllm/entrypoints/openai/api_server.py +1376 -0
  191. vllm/entrypoints/openai/cli_args.py +306 -0
  192. vllm/entrypoints/openai/logits_processors.py +89 -0
  193. vllm/entrypoints/openai/protocol.py +1890 -0
  194. vllm/entrypoints/openai/run_batch.py +439 -0
  195. vllm/entrypoints/openai/serving_chat.py +1192 -0
  196. vllm/entrypoints/openai/serving_classification.py +159 -0
  197. vllm/entrypoints/openai/serving_completion.py +590 -0
  198. vllm/entrypoints/openai/serving_embedding.py +200 -0
  199. vllm/entrypoints/openai/serving_engine.py +985 -0
  200. vllm/entrypoints/openai/serving_models.py +314 -0
  201. vllm/entrypoints/openai/serving_pooling.py +231 -0
  202. vllm/entrypoints/openai/serving_score.py +432 -0
  203. vllm/entrypoints/openai/serving_tokenization.py +151 -0
  204. vllm/entrypoints/openai/serving_transcription.py +421 -0
  205. vllm/entrypoints/openai/tool_parsers/__init__.py +22 -0
  206. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  207. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +369 -0
  208. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +258 -0
  209. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +236 -0
  210. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  211. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +215 -0
  212. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +307 -0
  213. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +302 -0
  214. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +266 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  216. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +111 -0
  217. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +296 -0
  218. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  219. vllm/entrypoints/score_utils.py +49 -0
  220. vllm/entrypoints/ssl.py +74 -0
  221. vllm/entrypoints/utils.py +219 -0
  222. vllm/env_override.py +34 -0
  223. vllm/envs.py +896 -0
  224. vllm/executor/__init__.py +0 -0
  225. vllm/executor/executor_base.py +400 -0
  226. vllm/executor/mp_distributed_executor.py +243 -0
  227. vllm/executor/msgspec_utils.py +29 -0
  228. vllm/executor/multiproc_worker_utils.py +312 -0
  229. vllm/executor/ray_distributed_executor.py +700 -0
  230. vllm/executor/ray_utils.py +398 -0
  231. vllm/executor/uniproc_executor.py +138 -0
  232. vllm/forward_context.py +147 -0
  233. vllm/inputs/__init__.py +40 -0
  234. vllm/inputs/data.py +330 -0
  235. vllm/inputs/parse.py +150 -0
  236. vllm/inputs/preprocess.py +908 -0
  237. vllm/inputs/registry.py +214 -0
  238. vllm/jsontree.py +79 -0
  239. vllm/logger.py +211 -0
  240. vllm/logging_utils/__init__.py +7 -0
  241. vllm/logging_utils/dump_input.py +84 -0
  242. vllm/logging_utils/formatter.py +17 -0
  243. vllm/logits_process.py +118 -0
  244. vllm/lora/__init__.py +0 -0
  245. vllm/lora/fully_sharded_layers.py +354 -0
  246. vllm/lora/layers.py +1284 -0
  247. vllm/lora/lora.py +198 -0
  248. vllm/lora/models.py +817 -0
  249. vllm/lora/ops/__init__.py +0 -0
  250. vllm/lora/ops/torch_ops/__init__.py +15 -0
  251. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  252. vllm/lora/ops/triton_ops/__init__.py +11 -0
  253. vllm/lora/ops/triton_ops/kernel_utils.py +242 -0
  254. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  255. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  256. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  257. vllm/lora/ops/triton_ops/utils.py +119 -0
  258. vllm/lora/ops/xla_ops/__init__.py +6 -0
  259. vllm/lora/ops/xla_ops/lora_ops.py +106 -0
  260. vllm/lora/ops/xla_ops/pallas.py +133 -0
  261. vllm/lora/peft_helper.py +135 -0
  262. vllm/lora/punica_wrapper/__init__.py +9 -0
  263. vllm/lora/punica_wrapper/punica_base.py +484 -0
  264. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  265. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  266. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  267. vllm/lora/punica_wrapper/punica_selector.py +19 -0
  268. vllm/lora/punica_wrapper/punica_tpu.py +325 -0
  269. vllm/lora/punica_wrapper/utils.py +163 -0
  270. vllm/lora/request.py +98 -0
  271. vllm/lora/resolver.py +84 -0
  272. vllm/lora/utils.py +239 -0
  273. vllm/lora/worker_manager.py +253 -0
  274. vllm/model_executor/__init__.py +15 -0
  275. vllm/model_executor/custom_op.py +151 -0
  276. vllm/model_executor/guided_decoding/__init__.py +180 -0
  277. vllm/model_executor/guided_decoding/guidance_decoding.py +62 -0
  278. vllm/model_executor/guided_decoding/guidance_logits_processors.py +103 -0
  279. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  280. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  281. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  282. vllm/model_executor/guided_decoding/outlines_logits_processors.py +283 -0
  283. vllm/model_executor/guided_decoding/utils.py +241 -0
  284. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  285. vllm/model_executor/layers/__init__.py +0 -0
  286. vllm/model_executor/layers/activation.py +368 -0
  287. vllm/model_executor/layers/fused_moe/__init__.py +53 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  449. vllm/model_executor/layers/fused_moe/cutlass_moe.py +382 -0
  450. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +227 -0
  451. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +755 -0
  452. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +231 -0
  453. vllm/model_executor/layers/fused_moe/fused_moe.py +1722 -0
  454. vllm/model_executor/layers/fused_moe/layer.py +1366 -0
  455. vllm/model_executor/layers/fused_moe/modular_kernel.py +364 -0
  456. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +242 -0
  457. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  458. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +188 -0
  459. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  460. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +146 -0
  461. vllm/model_executor/layers/fused_moe/prepare_finalize.py +60 -0
  462. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +372 -0
  463. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +112 -0
  464. vllm/model_executor/layers/fused_moe/utils.py +97 -0
  465. vllm/model_executor/layers/layernorm.py +287 -0
  466. vllm/model_executor/layers/lightning_attn.py +651 -0
  467. vllm/model_executor/layers/linear.py +1523 -0
  468. vllm/model_executor/layers/logits_processor.py +196 -0
  469. vllm/model_executor/layers/mamba/__init__.py +0 -0
  470. vllm/model_executor/layers/mamba/mamba2_metadata.py +124 -0
  471. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  472. vllm/model_executor/layers/mamba/mamba_mixer2.py +615 -0
  473. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  474. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  475. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +413 -0
  476. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  477. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  478. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  479. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  480. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  481. vllm/model_executor/layers/pooler.py +343 -0
  482. vllm/model_executor/layers/quantization/__init__.py +156 -0
  483. vllm/model_executor/layers/quantization/aqlm.py +375 -0
  484. vllm/model_executor/layers/quantization/auto_round.py +308 -0
  485. vllm/model_executor/layers/quantization/awq.py +185 -0
  486. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  487. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  488. vllm/model_executor/layers/quantization/base_config.py +150 -0
  489. vllm/model_executor/layers/quantization/bitblas.py +460 -0
  490. vllm/model_executor/layers/quantization/bitsandbytes.py +397 -0
  491. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  492. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +644 -0
  493. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1252 -0
  494. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +21 -0
  495. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  496. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  497. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  498. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +92 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +120 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +214 -0
  505. vllm/model_executor/layers/quantization/deepspeedfp.py +194 -0
  506. vllm/model_executor/layers/quantization/experts_int8.py +195 -0
  507. vllm/model_executor/layers/quantization/fbgemm_fp8.py +171 -0
  508. vllm/model_executor/layers/quantization/fp8.py +876 -0
  509. vllm/model_executor/layers/quantization/gguf.py +564 -0
  510. vllm/model_executor/layers/quantization/gptq.py +277 -0
  511. vllm/model_executor/layers/quantization/gptq_bitblas.py +444 -0
  512. vllm/model_executor/layers/quantization/gptq_marlin.py +647 -0
  513. vllm/model_executor/layers/quantization/gptq_marlin_24.py +296 -0
  514. vllm/model_executor/layers/quantization/hqq_marlin.py +331 -0
  515. vllm/model_executor/layers/quantization/ipex_quant.py +249 -0
  516. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  517. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  518. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  519. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  520. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  521. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  522. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  523. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +130 -0
  524. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  525. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  526. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  527. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  528. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  529. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  530. vllm/model_executor/layers/quantization/kv_cache.py +138 -0
  531. vllm/model_executor/layers/quantization/marlin.py +260 -0
  532. vllm/model_executor/layers/quantization/modelopt.py +734 -0
  533. vllm/model_executor/layers/quantization/moe_wna16.py +448 -0
  534. vllm/model_executor/layers/quantization/neuron_quant.py +68 -0
  535. vllm/model_executor/layers/quantization/ptpc_fp8.py +126 -0
  536. vllm/model_executor/layers/quantization/qqq.py +274 -0
  537. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  538. vllm/model_executor/layers/quantization/quark/quark.py +440 -0
  539. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  540. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +8 -0
  541. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  542. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +125 -0
  543. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +145 -0
  544. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  545. vllm/model_executor/layers/quantization/quark/utils.py +104 -0
  546. vllm/model_executor/layers/quantization/schema.py +85 -0
  547. vllm/model_executor/layers/quantization/torchao.py +143 -0
  548. vllm/model_executor/layers/quantization/tpu_int8.py +120 -0
  549. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  550. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  551. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +207 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  754. vllm/model_executor/layers/quantization/utils/fp8_utils.py +611 -0
  755. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  756. vllm/model_executor/layers/quantization/utils/int8_utils.py +484 -0
  757. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  758. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  759. vllm/model_executor/layers/quantization/utils/marlin_utils.py +475 -0
  760. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +277 -0
  761. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +324 -0
  762. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  763. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +463 -0
  764. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +125 -0
  765. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +44 -0
  766. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +61 -0
  767. vllm/model_executor/layers/quantization/utils/quant_utils.py +572 -0
  768. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  769. vllm/model_executor/layers/rejection_sampler.py +405 -0
  770. vllm/model_executor/layers/resampler.py +269 -0
  771. vllm/model_executor/layers/rotary_embedding.py +1861 -0
  772. vllm/model_executor/layers/sampler.py +1203 -0
  773. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  774. vllm/model_executor/layers/typical_acceptance_sampler.py +165 -0
  775. vllm/model_executor/layers/utils.py +99 -0
  776. vllm/model_executor/layers/vocab_parallel_embedding.py +486 -0
  777. vllm/model_executor/model_loader/__init__.py +75 -0
  778. vllm/model_executor/model_loader/base_loader.py +24 -0
  779. vllm/model_executor/model_loader/bitsandbytes_loader.py +582 -0
  780. vllm/model_executor/model_loader/default_loader.py +295 -0
  781. vllm/model_executor/model_loader/dummy_loader.py +37 -0
  782. vllm/model_executor/model_loader/gguf_loader.py +113 -0
  783. vllm/model_executor/model_loader/neuron.py +475 -0
  784. vllm/model_executor/model_loader/neuronx_distributed.py +622 -0
  785. vllm/model_executor/model_loader/runai_streamer_loader.py +120 -0
  786. vllm/model_executor/model_loader/sharded_state_loader.py +211 -0
  787. vllm/model_executor/model_loader/tensorizer.py +632 -0
  788. vllm/model_executor/model_loader/tensorizer_loader.py +122 -0
  789. vllm/model_executor/model_loader/utils.py +301 -0
  790. vllm/model_executor/model_loader/weight_utils.py +781 -0
  791. vllm/model_executor/models/__init__.py +27 -0
  792. vllm/model_executor/models/adapters.py +247 -0
  793. vllm/model_executor/models/aimv2.py +199 -0
  794. vllm/model_executor/models/arctic.py +558 -0
  795. vllm/model_executor/models/aria.py +656 -0
  796. vllm/model_executor/models/aya_vision.py +461 -0
  797. vllm/model_executor/models/baichuan.py +473 -0
  798. vllm/model_executor/models/bamba.py +542 -0
  799. vllm/model_executor/models/bart.py +937 -0
  800. vllm/model_executor/models/bert.py +517 -0
  801. vllm/model_executor/models/bert_with_rope.py +714 -0
  802. vllm/model_executor/models/blip.py +338 -0
  803. vllm/model_executor/models/blip2.py +717 -0
  804. vllm/model_executor/models/bloom.py +372 -0
  805. vllm/model_executor/models/chameleon.py +1135 -0
  806. vllm/model_executor/models/chatglm.py +477 -0
  807. vllm/model_executor/models/clip.py +411 -0
  808. vllm/model_executor/models/commandr.py +471 -0
  809. vllm/model_executor/models/constant_size_cache.py +136 -0
  810. vllm/model_executor/models/dbrx.py +471 -0
  811. vllm/model_executor/models/deepseek.py +485 -0
  812. vllm/model_executor/models/deepseek_mtp.py +268 -0
  813. vllm/model_executor/models/deepseek_v2.py +842 -0
  814. vllm/model_executor/models/deepseek_vl2.py +647 -0
  815. vllm/model_executor/models/eagle.py +259 -0
  816. vllm/model_executor/models/exaone.py +550 -0
  817. vllm/model_executor/models/fairseq2_llama.py +153 -0
  818. vllm/model_executor/models/falcon.py +509 -0
  819. vllm/model_executor/models/falcon_h1.py +684 -0
  820. vllm/model_executor/models/florence2.py +1102 -0
  821. vllm/model_executor/models/fuyu.py +388 -0
  822. vllm/model_executor/models/gemma.py +424 -0
  823. vllm/model_executor/models/gemma2.py +424 -0
  824. vllm/model_executor/models/gemma3.py +532 -0
  825. vllm/model_executor/models/gemma3_mm.py +708 -0
  826. vllm/model_executor/models/glm.py +22 -0
  827. vllm/model_executor/models/glm4.py +304 -0
  828. vllm/model_executor/models/glm4v.py +647 -0
  829. vllm/model_executor/models/gpt2.py +327 -0
  830. vllm/model_executor/models/gpt_bigcode.py +334 -0
  831. vllm/model_executor/models/gpt_j.py +338 -0
  832. vllm/model_executor/models/gpt_neox.py +331 -0
  833. vllm/model_executor/models/granite.py +492 -0
  834. vllm/model_executor/models/granite_speech.py +778 -0
  835. vllm/model_executor/models/granitemoe.py +436 -0
  836. vllm/model_executor/models/granitemoehybrid.py +585 -0
  837. vllm/model_executor/models/granitemoeshared.py +340 -0
  838. vllm/model_executor/models/gritlm.py +223 -0
  839. vllm/model_executor/models/grok1.py +545 -0
  840. vllm/model_executor/models/h2ovl.py +545 -0
  841. vllm/model_executor/models/idefics2_vision_model.py +388 -0
  842. vllm/model_executor/models/idefics3.py +767 -0
  843. vllm/model_executor/models/interfaces.py +571 -0
  844. vllm/model_executor/models/interfaces_base.py +163 -0
  845. vllm/model_executor/models/intern_vit.py +475 -0
  846. vllm/model_executor/models/internlm2.py +454 -0
  847. vllm/model_executor/models/internlm2_ve.py +146 -0
  848. vllm/model_executor/models/internvl.py +1405 -0
  849. vllm/model_executor/models/jais.py +372 -0
  850. vllm/model_executor/models/jamba.py +591 -0
  851. vllm/model_executor/models/kimi_vl.py +576 -0
  852. vllm/model_executor/models/llama.py +643 -0
  853. vllm/model_executor/models/llama4.py +531 -0
  854. vllm/model_executor/models/llama_eagle.py +166 -0
  855. vllm/model_executor/models/llama_eagle3.py +257 -0
  856. vllm/model_executor/models/llava.py +865 -0
  857. vllm/model_executor/models/llava_next.py +585 -0
  858. vllm/model_executor/models/llava_next_video.py +470 -0
  859. vllm/model_executor/models/llava_onevision.py +955 -0
  860. vllm/model_executor/models/mamba.py +272 -0
  861. vllm/model_executor/models/mamba2.py +302 -0
  862. vllm/model_executor/models/mamba_cache.py +75 -0
  863. vllm/model_executor/models/medusa.py +218 -0
  864. vllm/model_executor/models/mimo.py +191 -0
  865. vllm/model_executor/models/mimo_mtp.py +284 -0
  866. vllm/model_executor/models/minicpm.py +590 -0
  867. vllm/model_executor/models/minicpm3.py +229 -0
  868. vllm/model_executor/models/minicpmo.py +758 -0
  869. vllm/model_executor/models/minicpmv.py +1286 -0
  870. vllm/model_executor/models/minimax_cache.py +35 -0
  871. vllm/model_executor/models/minimax_text_01.py +1303 -0
  872. vllm/model_executor/models/minimax_vl_01.py +363 -0
  873. vllm/model_executor/models/mistral3.py +603 -0
  874. vllm/model_executor/models/mixtral.py +487 -0
  875. vllm/model_executor/models/mixtral_quant.py +452 -0
  876. vllm/model_executor/models/mllama.py +1623 -0
  877. vllm/model_executor/models/mllama4.py +838 -0
  878. vllm/model_executor/models/mlp_speculator.py +205 -0
  879. vllm/model_executor/models/modernbert.py +329 -0
  880. vllm/model_executor/models/module_mapping.py +71 -0
  881. vllm/model_executor/models/molmo.py +1567 -0
  882. vllm/model_executor/models/moonvit.py +629 -0
  883. vllm/model_executor/models/mpt.py +330 -0
  884. vllm/model_executor/models/nemotron.py +507 -0
  885. vllm/model_executor/models/nemotron_nas.py +483 -0
  886. vllm/model_executor/models/nvlm_d.py +215 -0
  887. vllm/model_executor/models/olmo.py +388 -0
  888. vllm/model_executor/models/olmo2.py +413 -0
  889. vllm/model_executor/models/olmoe.py +446 -0
  890. vllm/model_executor/models/opt.py +411 -0
  891. vllm/model_executor/models/orion.py +348 -0
  892. vllm/model_executor/models/ovis.py +554 -0
  893. vllm/model_executor/models/paligemma.py +397 -0
  894. vllm/model_executor/models/persimmon.py +343 -0
  895. vllm/model_executor/models/phi.py +355 -0
  896. vllm/model_executor/models/phi3.py +18 -0
  897. vllm/model_executor/models/phi3_small.py +464 -0
  898. vllm/model_executor/models/phi3v.py +722 -0
  899. vllm/model_executor/models/phi4mm.py +1245 -0
  900. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  901. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  902. vllm/model_executor/models/phimoe.py +664 -0
  903. vllm/model_executor/models/pixtral.py +1315 -0
  904. vllm/model_executor/models/plamo2.py +737 -0
  905. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  906. vllm/model_executor/models/qwen.py +361 -0
  907. vllm/model_executor/models/qwen2.py +567 -0
  908. vllm/model_executor/models/qwen2_5_omni_thinker.py +903 -0
  909. vllm/model_executor/models/qwen2_5_vl.py +1171 -0
  910. vllm/model_executor/models/qwen2_audio.py +409 -0
  911. vllm/model_executor/models/qwen2_moe.py +539 -0
  912. vllm/model_executor/models/qwen2_rm.py +131 -0
  913. vllm/model_executor/models/qwen2_vl.py +1410 -0
  914. vllm/model_executor/models/qwen3.py +320 -0
  915. vllm/model_executor/models/qwen3_moe.py +534 -0
  916. vllm/model_executor/models/qwen_vl.py +784 -0
  917. vllm/model_executor/models/registry.py +618 -0
  918. vllm/model_executor/models/roberta.py +273 -0
  919. vllm/model_executor/models/siglip.py +523 -0
  920. vllm/model_executor/models/skyworkr1v.py +950 -0
  921. vllm/model_executor/models/smolvlm.py +51 -0
  922. vllm/model_executor/models/solar.py +505 -0
  923. vllm/model_executor/models/stablelm.py +342 -0
  924. vllm/model_executor/models/starcoder2.py +355 -0
  925. vllm/model_executor/models/telechat2.py +139 -0
  926. vllm/model_executor/models/teleflm.py +78 -0
  927. vllm/model_executor/models/transformers.py +507 -0
  928. vllm/model_executor/models/ultravox.py +655 -0
  929. vllm/model_executor/models/utils.py +730 -0
  930. vllm/model_executor/models/vision.py +146 -0
  931. vllm/model_executor/models/whisper.py +746 -0
  932. vllm/model_executor/models/zamba2.py +1008 -0
  933. vllm/model_executor/parameter.py +458 -0
  934. vllm/model_executor/pooling_metadata.py +71 -0
  935. vllm/model_executor/sampling_metadata.py +596 -0
  936. vllm/model_executor/utils.py +53 -0
  937. vllm/multimodal/__init__.py +32 -0
  938. vllm/multimodal/audio.py +105 -0
  939. vllm/multimodal/base.py +218 -0
  940. vllm/multimodal/hasher.py +117 -0
  941. vllm/multimodal/image.py +96 -0
  942. vllm/multimodal/inputs.py +872 -0
  943. vllm/multimodal/parse.py +460 -0
  944. vllm/multimodal/processing.py +1894 -0
  945. vllm/multimodal/profiling.py +273 -0
  946. vllm/multimodal/registry.py +330 -0
  947. vllm/multimodal/utils.py +392 -0
  948. vllm/multimodal/video.py +197 -0
  949. vllm/outputs.py +525 -0
  950. vllm/platforms/__init__.py +290 -0
  951. vllm/platforms/cpu.py +205 -0
  952. vllm/platforms/cuda.py +461 -0
  953. vllm/platforms/hpu.py +105 -0
  954. vllm/platforms/interface.py +492 -0
  955. vllm/platforms/neuron.py +152 -0
  956. vllm/platforms/rocm.py +388 -0
  957. vllm/platforms/tpu.py +215 -0
  958. vllm/platforms/xpu.py +155 -0
  959. vllm/plugins/__init__.py +86 -0
  960. vllm/plugins/lora_resolvers/README.md +15 -0
  961. vllm/plugins/lora_resolvers/__init__.py +0 -0
  962. vllm/plugins/lora_resolvers/filesystem_resolver.py +49 -0
  963. vllm/pooling_params.py +53 -0
  964. vllm/profiler/__init__.py +0 -0
  965. vllm/profiler/layerwise_profile.py +374 -0
  966. vllm/profiler/utils.py +147 -0
  967. vllm/prompt_adapter/__init__.py +0 -0
  968. vllm/prompt_adapter/layers.py +82 -0
  969. vllm/prompt_adapter/models.py +357 -0
  970. vllm/prompt_adapter/request.py +36 -0
  971. vllm/prompt_adapter/utils.py +97 -0
  972. vllm/prompt_adapter/worker_manager.py +178 -0
  973. vllm/py.typed +2 -0
  974. vllm/reasoning/__init__.py +14 -0
  975. vllm/reasoning/abs_reasoning_parsers.py +191 -0
  976. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  977. vllm/reasoning/granite_reasoning_parser.py +362 -0
  978. vllm/reasoning/qwen3_reasoning_parser.py +150 -0
  979. vllm/sampling_params.py +590 -0
  980. vllm/scalar_type.py +346 -0
  981. vllm/scripts.py +14 -0
  982. vllm/sequence.py +1567 -0
  983. vllm/spec_decode/__init__.py +0 -0
  984. vllm/spec_decode/batch_expansion.py +505 -0
  985. vllm/spec_decode/draft_model_runner.py +349 -0
  986. vllm/spec_decode/interfaces.py +98 -0
  987. vllm/spec_decode/medusa_worker.py +137 -0
  988. vllm/spec_decode/metrics.py +212 -0
  989. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  990. vllm/spec_decode/mqa_scorer.py +159 -0
  991. vllm/spec_decode/multi_step_worker.py +422 -0
  992. vllm/spec_decode/ngram_worker.py +195 -0
  993. vllm/spec_decode/proposer_worker_base.py +58 -0
  994. vllm/spec_decode/smaller_tp_proposer_worker.py +195 -0
  995. vllm/spec_decode/spec_decode_worker.py +1325 -0
  996. vllm/spec_decode/target_model_runner.py +44 -0
  997. vllm/spec_decode/top1_proposer.py +274 -0
  998. vllm/spec_decode/util.py +276 -0
  999. vllm/test_utils.py +129 -0
  1000. vllm/third_party/__init__.py +0 -0
  1001. vllm/third_party/pynvml.py +6139 -0
  1002. vllm/tracing.py +130 -0
  1003. vllm/transformers_utils/__init__.py +23 -0
  1004. vllm/transformers_utils/chat_templates/__init__.py +4 -0
  1005. vllm/transformers_utils/chat_templates/registry.py +59 -0
  1006. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1007. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1008. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1009. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1010. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1011. vllm/transformers_utils/config.py +835 -0
  1012. vllm/transformers_utils/configs/__init__.py +58 -0
  1013. vllm/transformers_utils/configs/arctic.py +206 -0
  1014. vllm/transformers_utils/configs/chatglm.py +71 -0
  1015. vllm/transformers_utils/configs/cohere2.py +194 -0
  1016. vllm/transformers_utils/configs/dbrx.py +279 -0
  1017. vllm/transformers_utils/configs/deepseek_vl2.py +215 -0
  1018. vllm/transformers_utils/configs/eagle.py +84 -0
  1019. vllm/transformers_utils/configs/exaone.py +189 -0
  1020. vllm/transformers_utils/configs/falcon.py +89 -0
  1021. vllm/transformers_utils/configs/h2ovl.py +15 -0
  1022. vllm/transformers_utils/configs/internvl.py +53 -0
  1023. vllm/transformers_utils/configs/jais.py +237 -0
  1024. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  1025. vllm/transformers_utils/configs/medusa.py +62 -0
  1026. vllm/transformers_utils/configs/minimax_text_01.py +69 -0
  1027. vllm/transformers_utils/configs/minimax_vl_01.py +70 -0
  1028. vllm/transformers_utils/configs/mllama.py +30 -0
  1029. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  1030. vllm/transformers_utils/configs/moonvit.py +32 -0
  1031. vllm/transformers_utils/configs/mpt.py +179 -0
  1032. vllm/transformers_utils/configs/nemotron.py +204 -0
  1033. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  1034. vllm/transformers_utils/configs/ovis.py +183 -0
  1035. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  1036. vllm/transformers_utils/configs/solar.py +246 -0
  1037. vllm/transformers_utils/configs/telechat2.py +63 -0
  1038. vllm/transformers_utils/configs/ultravox.py +107 -0
  1039. vllm/transformers_utils/detokenizer.py +167 -0
  1040. vllm/transformers_utils/detokenizer_utils.py +188 -0
  1041. vllm/transformers_utils/processor.py +220 -0
  1042. vllm/transformers_utils/processors/__init__.py +7 -0
  1043. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1044. vllm/transformers_utils/processors/ovis.py +419 -0
  1045. vllm/transformers_utils/s3_utils.py +161 -0
  1046. vllm/transformers_utils/tokenizer.py +301 -0
  1047. vllm/transformers_utils/tokenizer_base.py +148 -0
  1048. vllm/transformers_utils/tokenizer_group.py +119 -0
  1049. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  1050. vllm/transformers_utils/tokenizers/mistral.py +490 -0
  1051. vllm/transformers_utils/utils.py +98 -0
  1052. vllm/triton_utils/__init__.py +13 -0
  1053. vllm/triton_utils/importing.py +49 -0
  1054. vllm/usage/__init__.py +0 -0
  1055. vllm/usage/usage_lib.py +255 -0
  1056. vllm/utils.py +2844 -0
  1057. vllm/v1/__init__.py +0 -0
  1058. vllm/v1/attention/__init__.py +0 -0
  1059. vllm/v1/attention/backends/__init__.py +0 -0
  1060. vllm/v1/attention/backends/flash_attn.py +833 -0
  1061. vllm/v1/attention/backends/flashinfer.py +639 -0
  1062. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1063. vllm/v1/attention/backends/mla/common.py +926 -0
  1064. vllm/v1/attention/backends/mla/flashmla.py +150 -0
  1065. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +221 -0
  1066. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1067. vllm/v1/attention/backends/pallas.py +235 -0
  1068. vllm/v1/attention/backends/triton_attn.py +279 -0
  1069. vllm/v1/attention/backends/utils.py +18 -0
  1070. vllm/v1/core/__init__.py +0 -0
  1071. vllm/v1/core/block_pool.py +328 -0
  1072. vllm/v1/core/encoder_cache_manager.py +149 -0
  1073. vllm/v1/core/kv_cache_manager.py +372 -0
  1074. vllm/v1/core/kv_cache_utils.py +748 -0
  1075. vllm/v1/core/sched/__init__.py +0 -0
  1076. vllm/v1/core/sched/interface.py +143 -0
  1077. vllm/v1/core/sched/output.py +153 -0
  1078. vllm/v1/core/sched/scheduler.py +1015 -0
  1079. vllm/v1/core/sched/utils.py +22 -0
  1080. vllm/v1/core/single_type_kv_cache_manager.py +358 -0
  1081. vllm/v1/engine/__init__.py +171 -0
  1082. vllm/v1/engine/async_llm.py +546 -0
  1083. vllm/v1/engine/core.py +801 -0
  1084. vllm/v1/engine/core_client.py +1020 -0
  1085. vllm/v1/engine/detokenizer.py +260 -0
  1086. vllm/v1/engine/exceptions.py +16 -0
  1087. vllm/v1/engine/llm_engine.py +316 -0
  1088. vllm/v1/engine/logprobs.py +198 -0
  1089. vllm/v1/engine/mm_input_cache.py +90 -0
  1090. vllm/v1/engine/output_processor.py +427 -0
  1091. vllm/v1/engine/parallel_sampling.py +132 -0
  1092. vllm/v1/engine/processor.py +398 -0
  1093. vllm/v1/executor/__init__.py +0 -0
  1094. vllm/v1/executor/abstract.py +112 -0
  1095. vllm/v1/executor/multiproc_executor.py +532 -0
  1096. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1097. vllm/v1/kv_cache_interface.py +208 -0
  1098. vllm/v1/metrics/__init__.py +0 -0
  1099. vllm/v1/metrics/loggers.py +511 -0
  1100. vllm/v1/metrics/ray_wrappers.py +120 -0
  1101. vllm/v1/metrics/reader.py +245 -0
  1102. vllm/v1/metrics/stats.py +238 -0
  1103. vllm/v1/outputs.py +115 -0
  1104. vllm/v1/request.py +191 -0
  1105. vllm/v1/sample/__init__.py +0 -0
  1106. vllm/v1/sample/metadata.py +43 -0
  1107. vllm/v1/sample/ops/__init__.py +0 -0
  1108. vllm/v1/sample/ops/bad_words.py +38 -0
  1109. vllm/v1/sample/ops/penalties.py +58 -0
  1110. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1111. vllm/v1/sample/rejection_sampler.py +630 -0
  1112. vllm/v1/sample/sampler.py +270 -0
  1113. vllm/v1/sample/tpu/__init__.py +0 -0
  1114. vllm/v1/sample/tpu/metadata.py +123 -0
  1115. vllm/v1/sample/tpu/sampler.py +144 -0
  1116. vllm/v1/serial_utils.py +313 -0
  1117. vllm/v1/spec_decode/__init__.py +0 -0
  1118. vllm/v1/spec_decode/eagle.py +424 -0
  1119. vllm/v1/spec_decode/medusa.py +61 -0
  1120. vllm/v1/spec_decode/metadata.py +61 -0
  1121. vllm/v1/spec_decode/metrics.py +177 -0
  1122. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1123. vllm/v1/spec_decode/utils.py +45 -0
  1124. vllm/v1/structured_output/__init__.py +215 -0
  1125. vllm/v1/structured_output/backend_guidance.py +244 -0
  1126. vllm/v1/structured_output/backend_types.py +133 -0
  1127. vllm/v1/structured_output/backend_xgrammar.py +317 -0
  1128. vllm/v1/structured_output/request.py +85 -0
  1129. vllm/v1/structured_output/utils.py +174 -0
  1130. vllm/v1/utils.py +294 -0
  1131. vllm/v1/worker/__init__.py +0 -0
  1132. vllm/v1/worker/block_table.py +139 -0
  1133. vllm/v1/worker/gpu_input_batch.py +680 -0
  1134. vllm/v1/worker/gpu_model_runner.py +2084 -0
  1135. vllm/v1/worker/gpu_worker.py +373 -0
  1136. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1137. vllm/v1/worker/tpu_model_runner.py +1510 -0
  1138. vllm/v1/worker/tpu_worker.py +276 -0
  1139. vllm/v1/worker/utils.py +74 -0
  1140. vllm/v1/worker/worker_base.py +64 -0
  1141. vllm/version.py +40 -0
  1142. vllm/vllm_flash_attn/.gitkeep +0 -0
  1143. vllm/worker/__init__.py +0 -0
  1144. vllm/worker/cache_engine.py +144 -0
  1145. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1146. vllm/worker/cpu_model_runner.py +671 -0
  1147. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1148. vllm/worker/cpu_worker.py +400 -0
  1149. vllm/worker/enc_dec_model_runner.py +555 -0
  1150. vllm/worker/hpu_model_runner.py +2319 -0
  1151. vllm/worker/hpu_worker.py +483 -0
  1152. vllm/worker/model_runner.py +2178 -0
  1153. vllm/worker/model_runner_base.py +281 -0
  1154. vllm/worker/multi_step_hpu_worker.py +122 -0
  1155. vllm/worker/multi_step_model_runner.py +910 -0
  1156. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1157. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1158. vllm/worker/multi_step_tpu_worker.py +107 -0
  1159. vllm/worker/multi_step_worker.py +196 -0
  1160. vllm/worker/neuron_model_runner.py +418 -0
  1161. vllm/worker/neuron_worker.py +158 -0
  1162. vllm/worker/neuronx_distributed_model_runner.py +136 -0
  1163. vllm/worker/pooling_model_runner.py +211 -0
  1164. vllm/worker/tpu_model_runner.py +908 -0
  1165. vllm/worker/tpu_worker.py +336 -0
  1166. vllm/worker/utils.py +52 -0
  1167. vllm/worker/worker.py +574 -0
  1168. vllm/worker/worker_base.py +644 -0
  1169. vllm/worker/xpu_model_runner.py +606 -0
  1170. vllm/worker/xpu_worker.py +185 -0
  1171. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/METADATA +335 -0
  1172. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/RECORD +1175 -0
  1173. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/WHEEL +5 -0
  1174. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/entry_points.txt +5 -0
  1175. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/top_level.txt +1 -0
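
Note on the long run of `N=...,K=...` JSON files listed above: their filenames encode the lookup key for pre-tuned quantized-GEMM kernel configurations — the GEMM dimensions (N, K), the accelerator (`device_name`), the quantization scheme (`dtype`), and the quantization `block_shape`. As a rough, hypothetical illustration of that naming scheme (this helper is not part of the wheel; names are invented for clarity), such a filename could be built and parsed like this:

    # Illustrative sketch only -- not code shipped in this package.
    import re
    from typing import Optional

    def config_filename(n: int, k: int, device_name: str, dtype: str,
                        block_shape: list[int]) -> str:
        # Compose a filename following the pattern seen in the listing above.
        block = ",".join(str(b) for b in block_shape)
        return (f"N={n},K={k},device_name={device_name},"
                f"dtype={dtype},block_shape=[{block}].json")

    def parse_config_filename(name: str) -> Optional[dict]:
        # Recover the key fields from a filename such as
        # 'N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json'.
        m = re.fullmatch(
            r"N=(\d+),K=(\d+),device_name=([^,]+),dtype=([^,]+),"
            r"block_shape=\[(\d+),(\d+)\]\.json", name)
        if m is None:
            return None
        return {
            "N": int(m.group(1)),
            "K": int(m.group(2)),
            "device_name": m.group(3),
            "dtype": m.group(4),
            "block_shape": [int(m.group(5)), int(m.group(6))],
        }

    print(config_filename(4096, 512, "NVIDIA_H200", "fp8_w8a8", [128, 128]))
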
vllm/core/scheduler.py ADDED
@@ -0,0 +1,2092 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import enum
4
+ import os
5
+ import random
6
+ import time
7
+ from collections import deque
8
+ from dataclasses import dataclass, field
9
+ from typing import Callable, Deque, Dict, Iterable, List, Optional
10
+ from typing import Sequence as GenericSequence
11
+ from typing import Set, Tuple, Union
12
+
13
+ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
14
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
15
+ from vllm.logger import init_logger
16
+ from vllm.lora.request import LoRARequest
17
+ from vllm.prompt_adapter.request import PromptAdapterRequest
18
+ from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
19
+ SequenceGroupBase, SequenceGroupMetadata,
20
+ SequenceGroupMetadataDelta, SequenceStage,
21
+ SequenceStatus)
22
+ from vllm.utils import Device, PyObjectCache
23
+
24
+ logger = init_logger(__name__)
25
+
26
+ # Test-only. If configured, decode is preempted with
27
+ # ARTIFICIAL_PREEMPTION_PROB% probability.
28
+ ENABLE_ARTIFICIAL_PREEMPT = bool(
29
+ os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa
30
+ ARTIFICIAL_PREEMPTION_PROB = 0.5
31
+ ARTIFICIAL_PREEMPTION_MAX_CNT = 500
32
+
33
+
34
+ class PreemptionMode(enum.Enum):
35
+ """Preemption modes.
36
+
37
+ 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
38
+ and swap them back in when the sequences are resumed.
39
+ 2. Recomputation: Discard the blocks of the preempted sequences and
40
+ recompute them when the sequences are resumed, treating the sequences as
41
+ new prompts.
42
+ """
43
+
44
+ SWAP = enum.auto()
45
+ RECOMPUTE = enum.auto()
46
+
47
+
48
+ @dataclass
49
+ class SchedulingBudget:
50
+ """The available slots for scheduling.
51
+
52
+ TODO(sang): Right now, the budget is request_id-aware meaning it can ignore
53
+ budget update from the same request_id. It is because in normal scheduling
54
+ path, we update RUNNING num_seqs ahead of time, meaning it could be
55
+ updated more than once when scheduling RUNNING requests. Since this won't
56
+ happen if we only have chunked prefill scheduling, we can remove this
57
+ feature from the API when chunked prefill is enabled by default.
58
+ """
59
+
60
+ token_budget: int
61
+ max_num_seqs: int
62
+ _request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
63
+ _request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
64
+ # Number of cached tokens in the batch.
65
+ _num_cached_tokens: int = 0
66
+ # Number of actual non-cached tokens in the batch.
67
+ _num_batched_tokens: int = 0
68
+ _num_curr_seqs: int = 0
69
+
70
+ def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
71
+ # We allow num_new_tokens to be 0 when the entire sequence has
72
+ # been cached.
73
+ assert num_new_tokens >= 0
74
+ assert num_new_seqs != 0
75
+ return (self.num_batched_tokens + num_new_tokens <= self.token_budget
76
+ and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)
77
+
78
+ def remaining_token_budget(self):
79
+ return self.token_budget - self.num_batched_tokens
80
+
81
+ def add_num_batched_tokens(self,
82
+ req_id: str,
83
+ num_batched_tokens: int,
84
+ num_cached_tokens: int = 0):
85
+ if req_id in self._request_ids_num_batched_tokens:
86
+ return
87
+ assert num_cached_tokens >= 0
88
+ assert num_batched_tokens >= 0
89
+
90
+ self._request_ids_num_batched_tokens.add(req_id)
91
+ self._num_batched_tokens += num_batched_tokens
92
+ self._num_cached_tokens += num_cached_tokens
93
+
94
+ def subtract_num_batched_tokens(self, req_id: str,
95
+ num_batched_tokens: int):
96
+ if req_id in self._request_ids_num_batched_tokens:
97
+ self._request_ids_num_batched_tokens.remove(req_id)
98
+ self._num_batched_tokens -= num_batched_tokens
99
+
100
+ def add_num_seqs(self, req_id: str, num_curr_seqs: int):
101
+ if req_id in self._request_ids_num_curr_seqs:
102
+ return
103
+
104
+ self._request_ids_num_curr_seqs.add(req_id)
105
+ self._num_curr_seqs += num_curr_seqs
106
+
107
+ def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
108
+ if req_id in self._request_ids_num_curr_seqs:
109
+ self._request_ids_num_curr_seqs.remove(req_id)
110
+ self._num_curr_seqs -= num_curr_seqs
111
+
112
+ @property
113
+ def num_batched_tokens(self):
114
+ return self._num_batched_tokens
115
+
116
+ @property
117
+ def num_curr_seqs(self):
118
+ return self._num_curr_seqs
119
+
120
+ @property
121
+ def num_cached_tokens(self):
122
+ return self._num_cached_tokens
123
+
124
+
125
+ @dataclass
126
+ class ScheduledSequenceGroup:
127
+ # A sequence group that's scheduled.
128
+ seq_group: SequenceGroup
129
+ # The total chunk size (number of tokens) to process for next iteration.
130
+ # 1 for decoding. Same as prompt tokens for prefill, but if prefill is
131
+ # chunked, it can be smaller than that.
132
+ token_chunk_size: int
133
+
134
+
135
+ @dataclass
136
+ class SchedulerOutputs:
137
+ """The scheduling decision made from a scheduler."""
138
+
139
+ # Scheduled sequence groups.
140
+ scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup]
141
+ # Number of prefill groups scheduled.
142
+ num_prefill_groups: int
143
+ # Total number of batched tokens.
144
+ num_batched_tokens: int
145
+ # Blocks to swap in. List of CPU -> GPU block number.
146
+ blocks_to_swap_in: List[Tuple[int, int]]
147
+ # Blocks to swap out. List of GPU -> CPU block number.
148
+ blocks_to_swap_out: List[Tuple[int, int]]
149
+ # Blocks to copy. Source to dest block.
150
+ blocks_to_copy: List[Tuple[int, int]]
151
+ # Sequence groups that are going to be ignored.
152
+ ignored_seq_groups: List[SequenceGroup]
153
+ # The number of slots for lookahead decoding.
154
+ num_lookahead_slots: int
155
+ # The number of requests in the running queue
156
+ running_queue_size: int
157
+ preempted: int
158
+
159
+ def __post_init__(self):
160
+ # Swap in and swap out should never happen at the same time.
161
+ assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)
162
+
163
+ self.num_loras: int = len(self.lora_requests)
164
+ if self.num_loras > 0:
165
+ self._sort_by_lora_ids()
166
+
167
+ self.num_prompt_adapters: int = len(self.prompt_adapter_requests)
168
+
169
+ def is_empty(self) -> bool:
170
+ # NOTE: We do not consider the ignored sequence groups.
171
+ return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
172
+ and not self.blocks_to_swap_out and not self.blocks_to_copy)
173
+
174
+ def _sort_by_lora_ids(self):
175
+ assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups)
176
+
177
+ def key_fn(group: ScheduledSequenceGroup):
178
+ key = (group.seq_group.lora_int_id, group.seq_group.request_id)
179
+ if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups):
180
+ # Sort sequence groups so that all prefills come before all
181
+ # decodes as required by chunked prefill.
182
+ return (not group.seq_group.is_prefill(), *key)
183
+ return key
184
+
185
+ self.scheduled_seq_groups = sorted(self.scheduled_seq_groups,
186
+ key=key_fn)
187
+
188
+ @property
189
+ def lora_requests(self) -> Set[LoRARequest]:
190
+ return {
191
+ g.seq_group.lora_request
192
+ for g in self.scheduled_seq_groups
193
+ if g.seq_group.lora_request is not None
194
+ }
195
+
196
+ @property
197
+ def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
198
+ return {
199
+ g.seq_group.prompt_adapter_request
200
+ for g in self.scheduled_seq_groups
201
+ if g.seq_group.prompt_adapter_request is not None
202
+ }
203
+
204
+
205
+ @dataclass
206
+ class SchedulerRunningOutputs:
207
+ """The requests that are scheduled from a running queue.
208
+
209
+ Could contain prefill (prefill that's chunked) or decodes. If there's not
210
+ enough memory, it can be preempted (for recompute) or swapped out.
211
+ """
212
+
213
+ # Selected sequences that are running and in a decoding phase.
214
+ decode_seq_groups: List[ScheduledSequenceGroup]
215
+ # Selected sequences that are running and in a prefill phase.
216
+ # I.e., it means the prefill has been chunked.
217
+ prefill_seq_groups: List[ScheduledSequenceGroup]
218
+ # The preempted sequences.
219
+ preempted: List[SequenceGroup]
220
+ # Sequences that are swapped out.
221
+ swapped_out: List[SequenceGroup]
222
+ # The blocks to swap out.
223
+ blocks_to_swap_out: List[Tuple[int, int]]
224
+ # The blocks to copy.
225
+ blocks_to_copy: List[Tuple[int, int]]
226
+ # The number of slots for lookahead decoding.
227
+ num_lookahead_slots: int
228
+
229
+ # Optimization for fast-access to seq_group lists
230
+ decode_seq_groups_list: List[SequenceGroup]
231
+ prefill_seq_groups_list: List[SequenceGroup]
232
+
233
+ @classmethod
234
+ def create_empty(cls) -> "SchedulerRunningOutputs":
235
+ return SchedulerRunningOutputs(
236
+ decode_seq_groups=[],
237
+ prefill_seq_groups=[],
238
+ preempted=[],
239
+ swapped_out=[],
240
+ blocks_to_swap_out=[],
241
+ blocks_to_copy=[],
242
+ num_lookahead_slots=0,
243
+ decode_seq_groups_list=[],
244
+ prefill_seq_groups_list=[],
245
+ )
246
+
247
+
248
+ @dataclass
249
+ class SchedulerSwappedInOutputs:
250
+ """The requests that are scheduled from a swap queue.
251
+
252
+ Could contain prefill (prefill that's chunked) or decodes.
253
+ """
254
+
255
+ # Selected sequences that are going to be swapped in and is in a
256
+ # decoding phase.
257
+ decode_seq_groups: List[ScheduledSequenceGroup]
258
+ # Selected sequences that are going to be swapped in and in a prefill
259
+ # phase. I.e., it means the prefill has been chunked.
260
+ prefill_seq_groups: List[ScheduledSequenceGroup]
261
+ # The blocks to swap in.
262
+ blocks_to_swap_in: List[Tuple[int, int]]
263
+ # The blocks to copy.
264
+ blocks_to_copy: List[Tuple[int, int]]
265
+ # The number of slots for lookahead decoding.
266
+ num_lookahead_slots: int
267
+ # Infeasible sequence groups.
268
+ infeasible_seq_groups: List[SequenceGroup]
269
+
270
+ @classmethod
271
+ def create_empty(cls) -> "SchedulerSwappedInOutputs":
272
+ return SchedulerSwappedInOutputs(
273
+ decode_seq_groups=[],
274
+ prefill_seq_groups=[],
275
+ blocks_to_swap_in=[],
276
+ blocks_to_copy=[],
277
+ num_lookahead_slots=0,
278
+ infeasible_seq_groups=[],
279
+ )
280
+
281
+
282
+ @dataclass
283
+ class SchedulerPrefillOutputs:
284
+ """The requests that are scheduled from a waiting queue.
285
+
286
+ Could contain a fresh prefill requests or preempted requests that need
287
+ to be recomputed from scratch.
288
+ """
289
+
290
+ # Selected sequences for prefill.
291
+ seq_groups: List[ScheduledSequenceGroup]
292
+ # Ignored sequence groups.
293
+ ignored_seq_groups: List[SequenceGroup]
294
+ num_lookahead_slots: int
295
+
296
+ @classmethod
297
+ def create_empty(cls) -> "SchedulerPrefillOutputs":
298
+ return SchedulerPrefillOutputs(
299
+ seq_groups=[],
300
+ ignored_seq_groups=[],
301
+ num_lookahead_slots=0,
302
+ )
303
+
304
+
305
+ def seq_group_metadata_builder():
306
+ return SequenceGroupMetadata(request_id="",
307
+ is_prompt=False,
308
+ seq_data={},
309
+ sampling_params=None,
310
+ block_tables={})
311
+
312
+
313
+ def scheduler_running_outputs_builder():
314
+ return SchedulerRunningOutputs(decode_seq_groups=[],
315
+ prefill_seq_groups=[],
316
+ preempted=[],
317
+ swapped_out=[],
318
+ blocks_to_swap_out=[],
319
+ blocks_to_copy=[],
320
+ num_lookahead_slots=0,
321
+ prefill_seq_groups_list=[],
322
+ decode_seq_groups_list=[])
323
+
324
+
325
+ def scheduled_seq_group_builder():
326
+ return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
327
+ token_chunk_size=0)
328
+ # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
329
+
330
+
331
+ @dataclass
332
+ class PartialPrefillMetadata:
333
+ """Holds information about the partial prefills that are currently running
334
+ during a single iteration of the Scheduler.
335
+ When chunked prefill is enabled, we allow a certain number of seqs to be
336
+ partially prefilled during each iteration. Having multiple partial prefills
337
+ in flight allows us to minimize TTFT and avoid decode starvation in cases
338
+ where a single sequence group with a very large prompt blocks the queue for
339
+ too many iterations.
340
+ The number of long prefill requests is limited so that smaller
341
+ requests may jump the queue in front of them and get to the decode
342
+ phase faster.
343
+ """
344
+
345
+ # A minimum bound on the total number of prefills to be scheduled during
346
+ # this iteration
347
+ schedulable_prefills: int
348
+
349
+ # The number of long prefill requests currently running
350
+ long_prefills: int
351
+
352
+ scheduler_config: SchedulerConfig
353
+
354
+ def can_schedule(self, seq_group: SequenceGroup) -> bool:
355
+ """When concurrent partial prefills are enabled,
356
+ we limit the number of long requests and only accept
357
+ shorter requests from the queue while running them
358
+ concurrently"""
359
+ return not (seq_group.first_seq.get_num_new_tokens()
360
+ > self.scheduler_config.long_prefill_token_threshold
361
+ and self.long_prefills
362
+ >= self.scheduler_config.max_long_partial_prefills
363
+ and self.scheduler_config.max_num_partial_prefills > 1)
364
+
365
+ def maybe_increment_partial_prefills(self,
366
+ seq_group: SequenceGroup) -> None:
367
+ # When a new prefill is scheduled, we need to know if it is a
368
+ # long request
369
+ if (seq_group.first_seq.get_num_new_tokens()
370
+ > self.scheduler_config.long_prefill_token_threshold):
371
+ self.long_prefills += 1
372
+
373
+ @classmethod
374
+ def from_queues(
375
+ cls,
376
+ running: Deque[SequenceGroup],
377
+ waiting: Deque[SequenceGroup],
378
+ scheduler_config: SchedulerConfig,
379
+ ) -> "PartialPrefillMetadata":
380
+ """Create a PartialPrefillMetadata object from the current state of
381
+ the scheduler's queues.
382
+ This accounts for the currently running prefill requests, and peeks into
383
+ the waiting queue to see if there are more prefills to potentially be
384
+ scheduled during this iteration."""
385
+ prefills = 0
386
+ long_prefills = 0
387
+
388
+ waiting_long_prefills = 0
389
+
390
+ for sg in running:
391
+ if sg.first_seq.data.stage == SequenceStage.PREFILL:
392
+ prefills += 1
393
+ if (sg.first_seq.get_num_new_tokens()
394
+ > scheduler_config.long_prefill_token_threshold):
395
+ long_prefills += 1
396
+
397
+ for sg in waiting:
398
+ # Don't bother looping through the rest of the queue if we know
399
+ # there are already at
400
+ # least max_partial_prefills requests to fill
401
+ if prefills >= scheduler_config.max_num_partial_prefills:
402
+ break
403
+
404
+ # Don't count long requests from the waiting queue if we aren't
405
+ # going to schedule them anyway
406
+ if (sg.first_seq.get_num_new_tokens()
407
+ > scheduler_config.long_prefill_token_threshold):
408
+ if (long_prefills + waiting_long_prefills
409
+ >= scheduler_config.max_long_partial_prefills):
410
+ continue
411
+ waiting_long_prefills += 1
412
+ prefills += 1
413
+
414
+ # NB: long_prefills and waiting_long_prefills are tracked separately.
415
+ # We don't account for the waiting requests here because we need to use
416
+ # this metadata to track how many have actually been scheduled.
417
+ return PartialPrefillMetadata(
418
+ schedulable_prefills=min(
419
+ prefills, scheduler_config.max_num_partial_prefills),
420
+ long_prefills=long_prefills,
421
+ scheduler_config=scheduler_config,
422
+ )
423
+
424
+
425
+ class Scheduler:
426
+
427
+ def __init__(
428
+ self,
429
+ scheduler_config: SchedulerConfig,
430
+ cache_config: CacheConfig,
431
+ lora_config: Optional[LoRAConfig],
432
+ pipeline_parallel_size: int = 1,
433
+ output_proc_callback: Optional[Callable] = None,
434
+ ) -> None:
435
+ self.scheduler_config = scheduler_config
436
+ self.cache_config = cache_config
437
+ # Note for LoRA scheduling: the current policy is extremely
438
+ # simple and NOT fair. It can lead to starvation of some
439
+ # LoRAs. This should be improved in the future.
440
+ self.lora_config = lora_config
441
+
442
+ version = "selfattn"
443
+ if (self.scheduler_config.runner_type == "pooling"
444
+ or self.cache_config.is_attention_free):
445
+ version = "placeholder"
446
+
447
+ BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
448
+ version)
449
+
450
+ num_gpu_blocks = cache_config.num_gpu_blocks
451
+ if num_gpu_blocks:
452
+ num_gpu_blocks //= pipeline_parallel_size
453
+
454
+ num_cpu_blocks = cache_config.num_cpu_blocks
455
+ if num_cpu_blocks:
456
+ num_cpu_blocks //= pipeline_parallel_size
457
+
458
+ # Create the block space manager.
459
+ self.block_manager = BlockSpaceManagerImpl(
460
+ block_size=self.cache_config.block_size,
461
+ num_gpu_blocks=num_gpu_blocks,
462
+ num_cpu_blocks=num_cpu_blocks,
463
+ sliding_window=self.cache_config.sliding_window,
464
+ enable_caching=self.cache_config.enable_prefix_caching,
465
+ )
466
+
467
+ # Sequence groups in the WAITING state.
468
+ # Contain new prefill or preempted requests.
469
+ self.waiting: Deque[SequenceGroup] = deque()
470
+ # Sequence groups in the RUNNING state.
471
+ # Contain decode requests.
472
+ self.running: Deque[SequenceGroup] = deque()
473
+ # Sequence groups in the SWAPPED state.
474
+ # Contain decode requests that are swapped out.
475
+ self.swapped: Deque[SequenceGroup] = deque()
476
+ # Sequence groups finished requests ids since last step iteration.
477
+ # It lets the model know that any state associated with these requests
478
+ # can and must be released after the current step.
479
+ # This is used to evict the finished requests from the Mamba cache.
480
+ self._finished_requests_ids: List[str] = list()
481
+ # Time at previous scheduling step
482
+ self.prev_time = 0.0
483
+ # Did we schedule a prompt at previous step?
484
+ self.prev_prompt = False
485
+ # Latency of the last prompt step
486
+ self.last_prompt_latency = 0.0
487
+ # Preemption mode: RECOMPUTE or SWAP.
488
+ self.user_specified_preemption_mode = scheduler_config.preemption_mode
489
+
490
+ # The following field is test-only. It is used to inject artificial
491
+ # preemption.
492
+ self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
493
+ self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
494
+ if self.enable_artificial_preemption
495
+ else 0)
496
+ self.num_cumulative_preemption: int = 0
497
+
498
+ # Used to cache python objects
499
+ self._seq_group_metadata_cache: List[PyObjectCache] = []
500
+ self._scheduler_running_outputs_cache: List[PyObjectCache] = []
501
+ self._scheduled_seq_group_cache: List[PyObjectCache] = []
502
+
503
+ # For async output processing, we need to swap cache buffers between
504
+ # iterations. I.e. since the output processing is lagged one step,
505
+ # we cannot reuse the cached objects immediately when schedule()
506
+ # is called again, but only on the call after that.
507
+ self.output_proc_callback = output_proc_callback
508
+ self.use_async_output_proc = self.output_proc_callback is not None
509
+ self.num_cache_iters = 2 if self.use_async_output_proc else 1
510
+
511
+ self.cache_id = 0
512
+ for i in range(self.num_cache_iters):
513
+ self._seq_group_metadata_cache.append(
514
+ PyObjectCache(seq_group_metadata_builder))
515
+ self._scheduler_running_outputs_cache.append(
516
+ PyObjectCache(scheduler_running_outputs_builder))
517
+ self._scheduled_seq_group_cache.append(
518
+ PyObjectCache(scheduled_seq_group_builder))
519
+
520
+ # For async postprocessor, the extra decode run cannot be done
521
+ # when the request reaches max_model_len. In this case, the request
522
+ # will be stopped during the schedule() call and added to this stop list
523
+ # for processing and deallocation by free_finished_seq_groups().
524
+ self._async_stopped: List[SequenceGroup] = []
525
+
526
+ # List of chunk sizes to hand out to each sequence, depending
527
+ # on how many partial prefills are running. This is slightly faster than
528
+ # running an integer division every time a prefill is scheduled.
529
+ # This splits the budget evenly among all prefills.
530
+ self.partial_prefill_budget_lookup_list = [0] * (
531
+ self.scheduler_config.max_num_partial_prefills + 1)
532
+ self.partial_prefill_budget_lookup_list[0] = (
533
+ scheduler_config.max_num_batched_tokens)
534
+ for i in range(1, self.scheduler_config.max_num_partial_prefills + 1):
535
+ self.partial_prefill_budget_lookup_list[i] = (
536
+ scheduler_config.max_num_batched_tokens // i)
537
+
538
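As a concrete illustration of the chunk-size table built just above (the input numbers are assumed, not defaults): the token budget is divided evenly among the concurrent prefills, so indexing by the current number of partial prefills avoids an integer division on the scheduling hot path.

# Example values only.
max_num_batched_tokens = 2048
max_num_partial_prefills = 4
lookup = [0] * (max_num_partial_prefills + 1)
lookup[0] = max_num_batched_tokens
for i in range(1, max_num_partial_prefills + 1):
    lookup[i] = max_num_batched_tokens // i
print(lookup)  # [2048, 2048, 1024, 682, 512]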
+ @property
539
+ def next_cache_id(self):
540
+ return (self.cache_id + 1) % self.num_cache_iters
541
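When async output processing is enabled, cache_id and next_cache_id alternate between two slots so that objects handed to the lagged output processor are not reused one step too early. A rough sketch of the rotation, assuming num_cache_iters is 2:

# Not the real Scheduler; just the two-slot rotation.
num_cache_iters, cache_id = 2, 0
for step in range(4):
    next_cache_id = (cache_id + 1) % num_cache_iters
    print(f"step {step}: allocate from slot {cache_id}, reset slot {next_cache_id}")
    cache_id = next_cache_id  # the "move to next cache" done at the end of schedule()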
+
542
+ @property
543
+ def lora_enabled(self) -> bool:
544
+ return bool(self.lora_config)
545
+
546
+ @property
547
+ def num_decoding_tokens_per_seq(self) -> int:
548
+ """The number of new tokens."""
549
+ return 1
550
+
551
+ def add_seq_group(self, seq_group: SequenceGroup) -> None:
552
+ # Add sequence groups to the waiting queue.
553
+ self.waiting.append(seq_group)
554
+
555
+ def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None:
556
+ # Add sequence groups to the running queue.
557
+ # Only for testing purposes.
558
+ self.running.append(seq_group)
559
+
560
+ def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None:
561
+ # Add sequence groups to the swapped queue.
562
+ # Only for testing purposes.
563
+ self.swapped.append(seq_group)
564
+
565
+ def abort_seq_group(
566
+ self,
567
+ request_id: Union[str, Iterable[str]],
568
+ seq_id_to_seq_group: Optional[Dict[str, SequenceGroupBase]] = None,
569
+ ) -> None:
570
+ """Aborts a sequence group with the given ID.
571
+
572
+ Check if the sequence group with the given ID
573
+ is present in any of the state queues.
574
+ If present, remove the sequence group from the state queue.
575
+ Also, if any of the sequences in the sequence group is not finished,
576
+ free the sequence with status `FINISHED_ABORTED`.
577
+ Otherwise, do nothing.
578
+
579
+ Args:
580
+ request_id: The ID(s) of the sequence group to abort.
581
+ seq_id_to_seq_group: helper for groups with n>1
582
+ """
583
+ if isinstance(request_id, str):
584
+ request_id = (request_id, )
585
+ request_ids = set(request_id)
586
+ seq_id_to_seq_group = seq_id_to_seq_group or {}
587
+ for state_queue in [self.waiting, self.running, self.swapped]:
588
+ aborted_groups: List[SequenceGroup] = []
589
+ for seq_group in state_queue:
590
+ # When n>1, seq_group.request_id looks like
591
+ # foo_parallel_sample_0, while request_ids is just foo, and we
592
+ # should resolve it as real_request_id to match.
593
+ if seq_group.request_id in seq_id_to_seq_group:
594
+ real_request_id = seq_id_to_seq_group[
595
+ seq_group.request_id].group_id
596
+ else:
597
+ real_request_id = seq_group.request_id
598
+ if real_request_id in request_ids:
599
+ # Append the aborted group to the pending list.
600
+ aborted_groups.append(seq_group)
601
+ # We can't remove real_request_id from request_ids here,
602
+ # because there may be other seq groups sharing the same
603
+ # real_request_id
604
+ for aborted_group in aborted_groups:
605
+ # Remove the sequence group from the state queue.
606
+ state_queue.remove(aborted_group)
607
+ # Remove the aborted request from the Mamba cache.
608
+ self._finished_requests_ids.append(aborted_group.request_id)
609
+ for seq in aborted_group.get_seqs():
610
+ if seq.is_finished():
611
+ continue
612
+ seq.status = SequenceStatus.FINISHED_ABORTED
613
+ self.free_seq(seq)
614
+ if aborted_group.request_id in seq_id_to_seq_group:
615
+ del seq_id_to_seq_group[aborted_group.request_id]
616
+
617
+ self._free_seq_group_cross_attn_blocks(aborted_group)
618
+
619
+ def _free_seq_group_cross_attn_blocks(
620
+ self,
621
+ seq_group: SequenceGroup,
622
+ ) -> None:
623
+ """
624
+ Free a sequence group from a cross-attention block table.
625
+ Has no effect on decoder-only models.
626
+ """
627
+ if seq_group.is_encoder_decoder():
628
+ self.block_manager.free_cross(seq_group)
629
+
630
+ def has_unfinished_seqs(self) -> bool:
631
+ return (len(self.waiting) != 0 or len(self.running) != 0
632
+ or len(self.swapped) != 0)
633
+
634
+ def get_prefix_cache_hit_rate(self, device: Device) -> float:
635
+ return self.block_manager.get_prefix_cache_hit_rate(device)
636
+
637
+ def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
638
+ return self.block_manager.reset_prefix_cache(device)
639
+
640
+ def get_num_unfinished_seq_groups(self) -> int:
641
+ return len(self.waiting) + len(self.running) + len(self.swapped)
642
+
643
+ def get_and_reset_finished_requests_ids(self) -> List[str]:
644
+ """Flushes the list of request ids of previously finished seq_groups."""
645
+ finished_requests_ids = self._finished_requests_ids
646
+ self._finished_requests_ids = list()
647
+ return finished_requests_ids
648
+
649
+ def _schedule_running(
650
+ self,
651
+ budget: SchedulingBudget,
652
+ curr_loras: Optional[Set[int]],
653
+ enable_chunking: bool = False,
654
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
655
+ ) -> SchedulerRunningOutputs:
656
+ """Schedule sequence groups that are running.
657
+
658
+ Running queue should include decode and chunked prefill requests.
659
+
660
+ Args:
661
+ budget: The scheduling budget. The argument is in-place updated
662
+ when any decodes are preempted.
663
+ curr_loras: Currently batched lora request ids. The argument is
664
+ in-place updated when any decodes are preempted.
665
+ enable_chunking: If True, seq group can be chunked and only a
666
+ chunked number of tokens are scheduled if
667
+ `budget.num_batched_tokens` does not have enough capacity to schedule
668
+ all tokens.
669
+ partial_prefill_metadata: information about the partial prefills
670
+ that are currently running
671
+
672
+ Returns:
673
+ SchedulerRunningOutputs.
674
+ """
675
+ ret: SchedulerRunningOutputs = self._scheduler_running_outputs_cache[
676
+ self.cache_id].get_object()
677
+ ret.blocks_to_swap_out.clear()
678
+ ret.blocks_to_copy.clear()
679
+ ret.decode_seq_groups.clear()
680
+ ret.prefill_seq_groups.clear()
681
+ ret.preempted.clear()
682
+ ret.swapped_out.clear()
683
+
684
+ ret.num_lookahead_slots = self._get_num_lookahead_slots(
685
+ is_prefill=False, enable_chunking=enable_chunking)
686
+
687
+ ret.decode_seq_groups_list.clear()
688
+ ret.prefill_seq_groups_list.clear()
689
+
690
+ # Blocks that need to be swapped or copied before model execution.
691
+ blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
692
+ blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy
693
+
694
+ decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
695
+ prefill_seq_groups: List[
696
+ ScheduledSequenceGroup] = ret.prefill_seq_groups
697
+ preempted: List[SequenceGroup] = ret.preempted
698
+ swapped_out: List[SequenceGroup] = ret.swapped_out
699
+
700
+ running_queue = self.running
701
+ assert len(self._async_stopped) == 0
702
+ while running_queue:
703
+ seq_group = running_queue[0]
704
+ # We discard the cached tokens info here because we don't need it
705
+ # for a running sequence:
706
+ # 1. If a sequence is running with chunked prefill, the cached
707
+ # tokens info was already used for the first prefill.
708
+ # 2. If a sequence is running with non-chunked prefill, then
709
+ # it is a decoding sequence, and the cached tokens info is
710
+ # irrelevant.
711
+ num_uncached_new_tokens, _ = \
712
+ self._get_num_new_uncached_and_cached_tokens(
713
+ seq_group,
714
+ SequenceStatus.RUNNING,
715
+ enable_chunking,
716
+ budget,
717
+ partial_prefill_metadata,
718
+ )
719
+
720
+ num_running_tokens = num_uncached_new_tokens
721
+ if num_running_tokens == 0:
722
+ # No budget => Stop
723
+ break
724
+
725
+ running_queue.popleft()
726
+
727
+ # With async postprocessor, an extra decode run is done
728
+ # to process the final tokens. The check below avoids this extra
729
+ # decode run when the model max len is reached, in order to avoid
730
+ # a memory overflow.
731
+ if (self.use_async_output_proc and seq_group.seqs[0].get_len()
732
+ > self.scheduler_config.max_model_len):
733
+ self._async_stopped.append(seq_group)
734
+ continue
735
+
736
+ # NOTE(woosuk): Preemption happens only when there is no available
737
+ # slot to keep all the sequence groups in the RUNNING state.
738
+ while not self._can_append_slots(seq_group, enable_chunking):
739
+ budget.subtract_num_batched_tokens(seq_group.request_id,
740
+ num_running_tokens)
741
+ num_running_seqs = seq_group.get_max_num_running_seqs()
742
+ budget.subtract_num_seqs(seq_group.request_id,
743
+ num_running_seqs)
744
+
745
+ if (curr_loras is not None and seq_group.lora_int_id > 0
746
+ and seq_group.lora_int_id in curr_loras):
747
+ curr_loras.remove(seq_group.lora_int_id)
748
+
749
+ # Determine victim sequence
750
+ cont_loop = True
751
+ if running_queue:
752
+ # Preempt the lowest-priority sequence group.
753
+ victim_seq_group = running_queue.pop()
754
+ else:
755
+ # No other sequence group can be preempted.
756
+ # Preempt the current sequence group.
757
+ # Note: This is also where we stop this loop
758
+ # (since there is nothing else to preempt)
759
+ victim_seq_group = seq_group
760
+ cont_loop = False
761
+
762
+ # With async postprocessor, before preempting a sequence
763
+ # we need to ensure it has no pending async output processing
764
+ do_preempt = True
765
+ if self.use_async_output_proc:
766
+ assert self.output_proc_callback is not None
767
+ self.output_proc_callback(
768
+ request_id=victim_seq_group.request_id)
769
+
770
+ # It may be that the async pending "victim_seq_group"
771
+ # becomes finished, in which case we simply free it.
772
+ if victim_seq_group.is_finished():
773
+ self._free_finished_seq_group(victim_seq_group)
774
+ do_preempt = False
775
+
776
+ # Do preemption
777
+ if do_preempt:
778
+ preempted_mode = self._preempt(victim_seq_group,
779
+ blocks_to_swap_out)
780
+ if preempted_mode == PreemptionMode.RECOMPUTE:
781
+ preempted.append(victim_seq_group)
782
+ else:
783
+ swapped_out.append(victim_seq_group)
784
+
785
+ if not cont_loop:
786
+ break
787
+ else:
788
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
789
+ is_prefill = seq_group.is_prefill()
790
+
791
+ scheduled_seq_group: ScheduledSequenceGroup = (
792
+ self._scheduled_seq_group_cache[
793
+ self.cache_id].get_object())
794
+ scheduled_seq_group.seq_group = seq_group
795
+ if is_prefill:
796
+ scheduled_seq_group.token_chunk_size = num_running_tokens
797
+ prefill_seq_groups.append(scheduled_seq_group)
798
+ ret.prefill_seq_groups_list.append(seq_group)
799
+ else:
800
+ scheduled_seq_group.token_chunk_size = 1
801
+ decode_seq_groups.append(scheduled_seq_group)
802
+ ret.decode_seq_groups_list.append(seq_group)
803
+
804
+ budget.add_num_batched_tokens(seq_group.request_id,
805
+ num_running_tokens)
806
+ # OPTIMIZATION: Note that get_max_num_running_seqs is
807
+ # expensive. For the default scheduling case where
808
+ # enable_chunking is False, num_seqs is already updated before running
809
+ # this method, so we don't have to update it again here.
810
+ if enable_chunking:
811
+ num_running_seqs = seq_group.get_max_num_running_seqs()
812
+ budget.add_num_seqs(seq_group.request_id, num_running_seqs)
813
+ if curr_loras is not None and seq_group.lora_int_id > 0:
814
+ curr_loras.add(seq_group.lora_int_id)
815
+
816
+ self._scheduler_running_outputs_cache[self.next_cache_id].reset()
817
+ self._scheduled_seq_group_cache[self.next_cache_id].reset()
818
+
819
+ return ret
820
+
821
+ def _schedule_swapped(
822
+ self,
823
+ budget: SchedulingBudget,
824
+ curr_loras: Optional[Set[int]],
825
+ enable_chunking: bool = False,
826
+ ) -> SchedulerSwappedInOutputs:
827
+ """Schedule sequence groups that are swapped out.
828
+
829
+ It schedules swapped requests as long as they fit `budget` and
830
+ curr_loras <= max_lora from the scheduling config. The input arguments
831
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
832
+
833
+ Args:
834
+ budget: The scheduling budget. The argument is in-place updated
835
+ when any requests are swapped in.
836
+ curr_loras: Currently batched lora request ids. The argument is
837
+ in-place updated when any requests are swapped in.
838
+ enable_chunking: If True, seq group can be chunked and only a
839
+ chunked number of tokens are scheduled if
840
+ `budget.num_batched_tokens` does not have enough capacity to schedule
841
+ all tokens.
842
+
843
+ Returns:
844
+ SchedulerSwappedInOutputs.
845
+ """
846
+ # Blocks that need to be swapped or copied before model execution.
847
+ blocks_to_swap_in: List[Tuple[int, int]] = []
848
+ blocks_to_copy: List[Tuple[int, int]] = []
849
+ decode_seq_groups: List[ScheduledSequenceGroup] = []
850
+ prefill_seq_groups: List[ScheduledSequenceGroup] = []
851
+ infeasible_seq_groups: List[SequenceGroup] = []
852
+
853
+ swapped_queue = self.swapped
854
+
855
+ leftover_swapped: Deque[SequenceGroup] = deque()
856
+ while swapped_queue:
857
+ seq_group = swapped_queue[0]
858
+
859
+ # If the sequence group cannot be swapped in, stop.
860
+ is_prefill = seq_group.is_prefill()
861
+ alloc_status = self.block_manager.can_swap_in(
862
+ seq_group,
863
+ self._get_num_lookahead_slots(is_prefill, enable_chunking))
864
+ if alloc_status == AllocStatus.LATER:
865
+ break
866
+ elif alloc_status == AllocStatus.NEVER:
867
+ logger.warning(
868
+ "Failing the request %s because there's not enough kv "
869
+ "cache blocks to run the entire sequence.",
870
+ seq_group.request_id,
871
+ )
872
+ for seq in seq_group.get_seqs():
873
+ seq.status = SequenceStatus.FINISHED_IGNORED
874
+ infeasible_seq_groups.append(seq_group)
875
+ swapped_queue.popleft()
876
+ continue
877
+
878
+ lora_int_id = 0
879
+ if self.lora_enabled:
880
+ lora_int_id = seq_group.lora_int_id
881
+ assert curr_loras is not None
882
+ assert self.lora_config is not None
883
+ if (lora_int_id > 0 and (lora_int_id not in curr_loras)
884
+ and len(curr_loras) >= self.lora_config.max_loras):
885
+ # We don't have space for another LoRA, so
886
+ # we ignore this request for now.
887
+ leftover_swapped.appendleft(seq_group)
888
+ swapped_queue.popleft()
889
+ continue
890
+
891
+ # The total number of sequences in the RUNNING state should not
892
+ # exceed the maximum number of sequences.
893
+ num_new_seqs = seq_group.get_max_num_running_seqs()
894
+ num_new_tokens_uncached, num_new_tokens_cached = (
895
+ self._get_num_new_uncached_and_cached_tokens(
896
+ seq_group, SequenceStatus.SWAPPED, enable_chunking,
897
+ budget))
898
+
899
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
900
+ num_new_tokens=num_new_tokens_uncached,
901
+ num_new_seqs=num_new_seqs,
902
+ ):
903
+ break
904
+
905
+ if lora_int_id > 0 and curr_loras is not None:
906
+ curr_loras.add(lora_int_id)
907
+ swapped_queue.popleft()
908
+ self._swap_in(seq_group, blocks_to_swap_in)
909
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
910
+ if is_prefill:
911
+ prefill_seq_groups.append(
912
+ ScheduledSequenceGroup(
913
+ seq_group,
914
+ token_chunk_size=num_new_tokens_uncached +
915
+ num_new_tokens_cached,
916
+ ))
917
+ else:
918
+ decode_seq_groups.append(
919
+ ScheduledSequenceGroup(seq_group, token_chunk_size=1))
920
+ budget.add_num_batched_tokens(
921
+ seq_group.request_id,
922
+ num_batched_tokens=num_new_tokens_uncached,
923
+ num_cached_tokens=num_new_tokens_cached,
924
+ )
925
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
926
+
927
+ swapped_queue.extendleft(leftover_swapped)
928
+
929
+ return SchedulerSwappedInOutputs(
930
+ decode_seq_groups=decode_seq_groups,
931
+ prefill_seq_groups=prefill_seq_groups,
932
+ blocks_to_swap_in=blocks_to_swap_in,
933
+ blocks_to_copy=blocks_to_copy,
934
+ num_lookahead_slots=self._get_num_lookahead_slots(
935
+ is_prefill=False, enable_chunking=enable_chunking),
936
+ infeasible_seq_groups=infeasible_seq_groups,
937
+ )
938
+
939
+ def _get_prompt_limit(self, seq_group: SequenceGroup) -> int:
940
+ if (self.scheduler_config.chunked_prefill_enabled
941
+ and not self.scheduler_config.is_multi_step):
942
+ prompt_limit = self.scheduler_config.max_model_len
943
+ else:
944
+ prompt_limit = min(
945
+ self.scheduler_config.max_model_len,
946
+ self.scheduler_config.max_num_batched_tokens,
947
+ )
948
+
949
+ # Model is fine-tuned with long context. Return the fine-tuned max_len.
950
+ if seq_group.lora_request and seq_group.lora_request.long_lora_max_len:
951
+ assert prompt_limit <= seq_group.lora_request.long_lora_max_len
952
+ return seq_group.lora_request.long_lora_max_len
953
+ else:
954
+ return prompt_limit
955
+
956
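A worked illustration of the prompt-limit rule above, using assumed numbers rather than vLLM defaults: with chunked prefill enabled (and not multi-step) the limit is max_model_len; otherwise it is the smaller of max_model_len and max_num_batched_tokens, and a long-LoRA request may substitute its own maximum.

# Illustrative only; not the Scheduler method itself.
def prompt_limit(max_model_len, max_num_batched_tokens, chunked_prefill,
                 long_lora_max_len=None):
    limit = max_model_len if chunked_prefill else min(max_model_len,
                                                      max_num_batched_tokens)
    return long_lora_max_len if long_lora_max_len else limit

print(prompt_limit(4096, 2048, chunked_prefill=False))  # 2048
print(prompt_limit(4096, 2048, chunked_prefill=True))   # 4096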
+ def _get_priority(self,
957
+ seq_group: SequenceGroup) -> Tuple[Optional[int], float]:
958
+ """Get the priority of the sequence group.
959
+ Highest preference to user-defined priority, followed by arrival time.
960
+ Args:
961
+ seq_group: The sequence group input.
962
+ Returns:
963
+ The priority of the sequence group.
964
+ """
965
+ return seq_group.priority, seq_group.arrival_time
966
+
967
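Because the key is a (priority, arrival_time) tuple, sorting a queue with it groups requests by user-defined priority first and breaks ties by arrival time. A tiny sketch with made-up requests:

# Hypothetical (priority, arrival_time) pairs; smaller tuples sort first.
requests = [("req-a", (1, 10.0)), ("req-b", (0, 12.0)), ("req-c", (0, 11.0))]
ordered = sorted(requests, key=lambda r: r[1])
print([name for name, _ in ordered])  # ['req-c', 'req-b', 'req-a']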
+ def _schedule_priority_preemption(
968
+ self,
969
+ budget: SchedulingBudget,
970
+ ) -> int:
971
+ """Sorts the waiting and running queues and force-preempts requests
972
+ from the running queue if their priority is lower.
973
+ Priority-based preemption is used with the priority policy.
974
+ Args:
975
+ budget: The scheduling budget. The argument is in-place updated
976
+ when any requests are scheduled.
977
+ Returns:
978
+ A count of priority-based preemptions.
979
+ """
980
+
981
+ waiting_queue = self.waiting
982
+
983
+ running_queue = deque(sorted(self.running, key=self._get_priority))
984
+
985
+ blocks_to_swap_out: List[Tuple[int, int]] = []
986
+ force_preemption_count = 0
987
+
988
+ if waiting_queue:
989
+ seq_group = waiting_queue.popleft()
990
+ num_new_seqs = seq_group.get_max_num_running_seqs()
991
+ num_new_tokens_uncached, _ = \
992
+ self._get_num_new_uncached_and_cached_tokens(
993
+ seq_group, SequenceStatus.WAITING, False, budget)
994
+
995
+ # Only preempt if priority inversion exists
996
+ while running_queue and self._get_priority(
997
+ running_queue[-1]) > self._get_priority(seq_group):
998
+ # Only preempt if waiting sequence cannot be allocated
999
+ can_allocate = self.block_manager.can_allocate(seq_group)
1000
+ if (num_new_tokens_uncached > 0
1001
+ and can_allocate == AllocStatus.OK
1002
+ and budget.can_schedule(
1003
+ num_new_tokens=num_new_tokens_uncached,
1004
+ num_new_seqs=num_new_seqs,
1005
+ )):
1006
+ break
1007
+
1008
+ # Adjust budget to remove the victim sequence group
1009
+ vseq_group = running_queue.pop()
1010
+ num_running_tokens_uncached, _ = (
1011
+ self._get_num_new_uncached_and_cached_tokens(
1012
+ vseq_group, SequenceStatus.RUNNING, False, budget))
1013
+ budget.subtract_num_batched_tokens(
1014
+ vseq_group.request_id, num_running_tokens_uncached)
1015
+ num_running_seqs = vseq_group.get_max_num_running_seqs()
1016
+ budget.subtract_num_seqs(vseq_group.request_id,
1017
+ num_running_seqs)
1018
+
1019
+ # Preempt out the victim sequence group
1020
+ self._preempt(vseq_group, blocks_to_swap_out)
1021
+ waiting_queue.appendleft(vseq_group)
1022
+ force_preemption_count += 1
1023
+ # Put the sequence back into the waiting queue
1024
+ waiting_queue.appendleft(seq_group)
1025
+
1026
+ waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))
1027
+
1028
+ self.waiting = waiting_queue
1029
+ self.running = running_queue
1030
+ return force_preemption_count
1031
+
1032
+ def _schedule_prefills(
1033
+ self,
1034
+ budget: SchedulingBudget,
1035
+ curr_loras: Optional[Set[int]],
1036
+ enable_chunking: bool = False,
1037
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
1038
+ ) -> SchedulerPrefillOutputs:
1039
+ """Schedule sequence groups that are in prefill stage.
1040
+
1041
+ Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
1042
+ as a new prefill (that starts from the beginning -> most recently generated
1043
+ tokens).
1044
+
1045
+ It schedules waiting requests as long as they fit `budget` and
1046
+ curr_loras <= max_lora from the scheduling config. The input arguments
1047
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
1048
+
1049
+ Args:
1050
+ budget: The scheduling budget. The argument is in-place updated
1051
+ when any requests are scheduled.
1052
+ curr_loras: Currently batched lora request ids. The argument is
1053
+ in-place updated when any requests are scheduled.
1054
+ enable_chunking: If True, seq group can be chunked and only a
1055
+ chunked number of tokens are scheduled if
1056
+ `budget.num_batched_tokens` does not have enough capacity to schedule
1057
+ all tokens.
1058
+ partial_prefill_metadata: information about the partial prefills
1059
+ that are currently running
1060
+
1061
+ Returns:
1062
+ SchedulerPrefillOutputs.
1063
+ """
1064
+ if budget.remaining_token_budget() == 0:
1065
+ # Do nothing: Can't add any more prefill anyway
1066
+ return SchedulerPrefillOutputs(
1067
+ seq_groups=[],
1068
+ ignored_seq_groups=[],
1069
+ num_lookahead_slots=self._get_num_lookahead_slots(
1070
+ is_prefill=True, enable_chunking=enable_chunking),
1071
+ )
1072
+ ignored_seq_groups: List[SequenceGroup] = []
1073
+ seq_groups: List[ScheduledSequenceGroup] = []
1074
+ using_prompt_embeds: bool = False
1075
+
1076
+ waiting_queue = self.waiting
1077
+
1078
+ leftover_waiting_sequences: Deque[SequenceGroup] = deque()
1079
+ while self._passed_delay(time.time()) and waiting_queue:
1080
+ seq_group = waiting_queue[0]
1081
+
1082
+ waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
1083
+ assert len(waiting_seqs) == 1, (
1084
+ "Waiting sequence group should have only one prompt "
1085
+ "sequence.")
1086
+ if (partial_prefill_metadata is not None
1087
+ and not partial_prefill_metadata.can_schedule(seq_group)):
1088
+ leftover_waiting_sequences.appendleft(seq_group)
1089
+ waiting_queue.popleft()
1090
+ continue
1091
+ num_new_tokens_uncached, num_new_tokens_cached = (
1092
+ self._get_num_new_uncached_and_cached_tokens(
1093
+ seq_group,
1094
+ SequenceStatus.WAITING,
1095
+ enable_chunking,
1096
+ budget,
1097
+ partial_prefill_metadata=partial_prefill_metadata,
1098
+ ))
1099
+ num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached
1100
+
1101
+ if not enable_chunking:
1102
+ num_prompt_tokens = waiting_seqs[0].get_len()
1103
+ assert num_new_tokens == num_prompt_tokens
1104
+
1105
+ prompt_limit = self._get_prompt_limit(seq_group)
1106
+ if num_new_tokens > prompt_limit:
1107
+ logger.warning(
1108
+ "Input prompt (%d tokens) is too long"
1109
+ " and exceeds limit of %d",
1110
+ num_new_tokens,
1111
+ prompt_limit,
1112
+ )
1113
+ for seq in waiting_seqs:
1114
+ seq.status = SequenceStatus.FINISHED_IGNORED
1115
+ ignored_seq_groups.append(seq_group)
1116
+ waiting_queue.popleft()
1117
+ continue
1118
+
1119
+ num_lookahead_slots: int = 0
1120
+ if self.scheduler_config.is_multi_step and enable_chunking:
1121
+ num_lookahead_slots = self._get_num_lookahead_slots(
1122
+ True, enable_chunking)
1123
+
1124
+ # If the sequence group cannot be allocated, stop.
1125
+ can_allocate = self.block_manager.can_allocate(
1126
+ seq_group, num_lookahead_slots=num_lookahead_slots)
1127
+ if can_allocate == AllocStatus.LATER:
1128
+ break
1129
+ elif can_allocate == AllocStatus.NEVER:
1130
+ logger.warning(
1131
+ "Input prompt (%d tokens) + lookahead slots (%d) is "
1132
+ "too long and exceeds the capacity of block_manager",
1133
+ num_new_tokens,
1134
+ num_lookahead_slots,
1135
+ )
1136
+ for seq in waiting_seqs:
1137
+ seq.status = SequenceStatus.FINISHED_IGNORED
1138
+ ignored_seq_groups.append(seq_group)
1139
+ waiting_queue.popleft()
1140
+ continue
1141
+
1142
+ # We cannot mix sequence groups that use prompt embeds and
1143
+ # those that do not.
1144
+ if len(seq_groups) == 0:
1145
+ using_prompt_embeds = seq_group.uses_prompt_embeds()
1146
+ if using_prompt_embeds != seq_group.uses_prompt_embeds():
1147
+ leftover_waiting_sequences.appendleft(seq_group)
1148
+ waiting_queue.popleft()
1149
+ continue
1150
+
1151
+ lora_int_id = 0
1152
+ if self.lora_enabled:
1153
+ lora_int_id = seq_group.lora_int_id
1154
+ assert curr_loras is not None
1155
+ assert self.lora_config is not None
1156
+ if (self.lora_enabled and lora_int_id > 0
1157
+ and lora_int_id not in curr_loras
1158
+ and len(curr_loras) >= self.lora_config.max_loras):
1159
+ # We don't have space for another LoRA, so
1160
+ # we ignore this request for now.
1161
+ leftover_waiting_sequences.appendleft(seq_group)
1162
+ waiting_queue.popleft()
1163
+ continue
1164
+
1165
+ if (budget.num_batched_tokens
1166
+ >= self.scheduler_config.max_num_batched_tokens):
1167
+ # We've reached the budget limit - since there might be
1168
+ # continuous prefills in the running queue, we should break
1169
+ # to avoid scheduling any new prefills.
1170
+ break
1171
+
1172
+ num_new_seqs = seq_group.get_max_num_running_seqs()
1173
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
1174
+ num_new_tokens=num_new_tokens_uncached,
1175
+ num_new_seqs=num_new_seqs,
1176
+ ):
1177
+ break
1178
+
1179
+ # Can schedule this request.
1180
+ if curr_loras is not None and lora_int_id > 0:
1181
+ curr_loras.add(lora_int_id)
1182
+ waiting_queue.popleft()
1183
+ self._allocate_and_set_running(seq_group)
1184
+
1185
+ if partial_prefill_metadata is not None:
1186
+ partial_prefill_metadata.maybe_increment_partial_prefills(
1187
+ seq_group)
1188
+
1189
+ if enable_chunking and self.scheduler_config.is_multi_step:
1190
+ blocks_to_copy: List[Tuple[int, int]] = []
1191
+ # init_multi_step_from_lookahead_slots happens in append_slots
1192
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
1193
+ # This assert will trip when a copy-on-write happens. This is
1194
+ # not a concern as the very first sequence-group block
1195
+ # allocation happens above. Still, we have the assert to
1196
+ # catch any edge-cases.
1197
+ assert not blocks_to_copy
1198
+ else:
1199
+ seq_group.init_multi_step_from_lookahead_slots(
1200
+ num_lookahead_slots,
1201
+ num_scheduler_steps=self.scheduler_config.
1202
+ num_scheduler_steps,
1203
+ is_multi_step=self.scheduler_config.is_multi_step,
1204
+ enable_chunking=enable_chunking,
1205
+ )
1206
+
1207
+ seq_groups.append(
1208
+ ScheduledSequenceGroup(seq_group=seq_group,
1209
+ token_chunk_size=num_new_tokens))
1210
+ budget.add_num_batched_tokens(
1211
+ seq_group.request_id,
1212
+ num_batched_tokens=num_new_tokens_uncached,
1213
+ num_cached_tokens=num_new_tokens_cached,
1214
+ )
1215
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
1216
+
1217
+ # Re-queue requests that couldn't be scheduled.
1218
+ waiting_queue.extendleft(leftover_waiting_sequences)
1219
+ if len(seq_groups) > 0:
1220
+ self.prev_prompt = True
1221
+
1222
+ return SchedulerPrefillOutputs(
1223
+ seq_groups=seq_groups,
1224
+ ignored_seq_groups=ignored_seq_groups,
1225
+ num_lookahead_slots=self._get_num_lookahead_slots(
1226
+ is_prefill=True, enable_chunking=enable_chunking),
1227
+ )
1228
+
1229
+ def _schedule_default(self) -> SchedulerOutputs:
1230
+ """Schedule queued requests.
1231
+
1232
+ The current policy is designed to optimize throughput. First,
1233
+ it batches as many prefill requests as possible. Then it schedules
1234
+ decodes. If there is pressure on GPU memory, decode requests can
1235
+ be swapped or preempted.
1236
+ """
1237
+ # Include running requests in the budget.
1238
+ budget = SchedulingBudget(
1239
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1240
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1241
+ )
1242
+ # Make sure we include num running seqs before scheduling prefill,
1243
+ # so that we don't schedule beyond max_num_seqs for prefill.
1244
+ for seq_group in self.running:
1245
+ budget.add_num_seqs(seq_group.request_id,
1246
+ seq_group.get_max_num_running_seqs())
1247
+ curr_loras = (set(
1248
+ seq_group.lora_int_id for seq_group in self.running
1249
+ if seq_group.lora_int_id > 0) if self.lora_enabled else None)
1250
+
1251
+ prefills = SchedulerPrefillOutputs.create_empty()
1252
+ running_scheduled = SchedulerRunningOutputs.create_empty()
1253
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1254
+
1255
+ # If any requests are swapped, prioritize swapped requests.
1256
+ if not self.swapped:
1257
+ prefills = self._schedule_prefills(budget,
1258
+ curr_loras,
1259
+ enable_chunking=False)
1260
+
1261
+ if len(prefills.seq_groups
1262
+ ) == 0 and self.scheduler_config.policy == "priority":
1263
+ self._schedule_priority_preemption(budget)
1264
+
1265
+ # Don't schedule decodes if prefills are scheduled.
1266
+ # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
1267
+ # only contains decode requests, not chunked prefills.
1268
+ if len(prefills.seq_groups) == 0:
1269
+ running_scheduled = self._schedule_running(budget,
1270
+ curr_loras,
1271
+ enable_chunking=False)
1272
+
1273
+ # If any sequence group is preempted, do not swap in any sequence
1274
+ # group, because it means there's no slot for new running requests.
1275
+ if (len(running_scheduled.preempted) +
1276
+ len(running_scheduled.swapped_out) == 0):
1277
+ swapped_in = \
1278
+ self._schedule_swapped(budget, curr_loras)
1279
+
1280
+ assert (budget.num_batched_tokens
1281
+ <= self.scheduler_config.max_num_batched_tokens)
1282
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1283
+
1284
+ # Update waiting requests.
1285
+ self.waiting.extendleft(running_scheduled.preempted)
1286
+ # Update new running requests.
1287
+ if len(prefills.seq_groups) > 0:
1288
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1289
+
1290
+ self.running.extend(running_scheduled.decode_seq_groups_list)
1291
+
1292
+ if len(swapped_in.decode_seq_groups) > 0:
1293
+ self.running.extend(
1294
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1295
+
1296
+ # Update swapped requests.
1297
+ self.swapped.extend(running_scheduled.swapped_out)
1298
+ preempted = len(running_scheduled.preempted) + len(
1299
+ running_scheduled.swapped_out)
1300
+
1301
+ # There should be no prefill from the running queue because this policy
1302
+ # doesn't allow chunked prefills.
1303
+ assert len(running_scheduled.prefill_seq_groups) == 0
1304
+ assert len(swapped_in.prefill_seq_groups) == 0
1305
+
1306
+ # Merge lists
1307
+ num_prefill_groups = len(prefills.seq_groups)
1308
+ ignored_seq_groups_for_embeds = list[SequenceGroup]()
1309
+ if num_prefill_groups > 0:
1310
+ scheduled_seq_groups = prefills.seq_groups
1311
+ scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
1312
+ ignored_seq_groups_for_embeds.clear()
1313
+ else:
1314
+ scheduled_seq_groups = running_scheduled.decode_seq_groups
1315
+ if len(scheduled_seq_groups) > 0:
1316
+ using_prompt_embeds = scheduled_seq_groups[
1317
+ 0].seq_group.uses_prompt_embeds()
1318
+ ignored_seq_groups_for_embeds.clear()
1319
+ indices_ignored = list[int]()
1320
+ for i, schedule_seq_group in enumerate(scheduled_seq_groups):
1321
+ if using_prompt_embeds !=\
1322
+ schedule_seq_group.seq_group.uses_prompt_embeds():
1323
+ ignored_seq_groups_for_embeds.append(
1324
+ schedule_seq_group.seq_group)
1325
+ indices_ignored.append(i)
1326
+ if len(ignored_seq_groups_for_embeds) > 0:
1327
+ scheduled_seq_groups = [
1328
+ group for i, group in enumerate(scheduled_seq_groups)
1329
+ if i not in indices_ignored
1330
+ ]
1331
+ else:
1332
+ ignored_seq_groups_for_embeds.clear()
1333
+
1334
+ scheduled_seq_groups.extend(swapped_in.decode_seq_groups)
1335
+
1336
+ blocks_to_copy = running_scheduled.blocks_to_copy
1337
+ blocks_to_copy.extend(swapped_in.blocks_to_copy)
1338
+
1339
+ ignored_seq_groups = prefills.ignored_seq_groups
1340
+ ignored_seq_groups.extend(ignored_seq_groups_for_embeds)
1341
+ ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)
1342
+
1343
+ return SchedulerOutputs(
1344
+ scheduled_seq_groups=scheduled_seq_groups,
1345
+ num_prefill_groups=num_prefill_groups,
1346
+ num_batched_tokens=budget.num_batched_tokens +
1347
+ budget.num_cached_tokens,
1348
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1349
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1350
+ blocks_to_copy=blocks_to_copy,
1351
+ ignored_seq_groups=ignored_seq_groups,
1352
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
1353
+ running_queue_size=len(self.running),
1354
+ preempted=preempted,
1355
+ )
1356
+
1357
+ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
1358
+ """Schedule queued requests.
1359
+
1360
+ Chunked prefill allows prefill requests to be chunked and batched together
1361
+ with decode requests. This policy: 1. schedules as many decode requests
1362
+ as possible; 2. schedules chunked prefill requests that are not
1363
+ finished; 3. schedules swapped requests; 4. schedules new prefill
1364
+ requests.
1365
+
1366
+ The policy sustains high GPU utilization because it can put
1367
+ prefill and decode requests in the same batch, while it improves
1368
+ inter-token latency because decode requests don't need to be blocked
1369
+ by prefill requests.
1370
+ """
1371
+ budget = SchedulingBudget(
1372
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1373
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1374
+ )
1375
+ curr_loras: Set[int] = set()
1376
+
1377
+ prefills = SchedulerPrefillOutputs.create_empty()
1378
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1379
+
1380
+ # Create partial prefill metadata
1381
+ partial_prefill_metadata = PartialPrefillMetadata.from_queues(
1382
+ running=self.running,
1383
+ waiting=self.waiting,
1384
+ scheduler_config=self.scheduler_config,
1385
+ )
1386
+
1387
+ # Decodes should always be scheduled first, in FCFS order.
1388
+ running_scheduled = self._schedule_running(
1389
+ budget,
1390
+ curr_loras,
1391
+ enable_chunking=True,
1392
+ partial_prefill_metadata=partial_prefill_metadata,
1393
+ )
1394
+
1395
+ # Schedule swapped out requests.
1396
+ # If preemption happens, it means we don't have space for swap-in.
1397
+ if len(running_scheduled.preempted) + len(
1398
+ running_scheduled.swapped_out) == 0:
1399
+ swapped_in = self._schedule_swapped(budget, curr_loras)
1400
+
1401
+ prefills = self._schedule_prefills(
1402
+ budget,
1403
+ curr_loras,
1404
+ enable_chunking=True,
1405
+ partial_prefill_metadata=partial_prefill_metadata,
1406
+ )
1407
+
1408
+ assert (budget.num_batched_tokens
1409
+ <= self.scheduler_config.max_num_batched_tokens)
1410
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1411
+
1412
+ # Update waiting requests.
1413
+ self.waiting.extendleft(running_scheduled.preempted)
1414
+
1415
+ # Update new running requests.
1416
+ # By default, the vLLM scheduler prioritizes prefills.
1417
+ # Once chunked prefill is enabled,
1418
+ # the policy is changed to prioritize decode requests.
1419
+ self.running.extend(
1420
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1421
+ self.running.extend(
1422
+ [s.seq_group for s in swapped_in.prefill_seq_groups])
1423
+ self.running.extend(
1424
+ [s.seq_group for s in running_scheduled.decode_seq_groups])
1425
+ # Because multiple prefills may be running concurrently, we need to
1426
+ # make sure that prefills which are scheduled to finish are listed
1427
+ # before those that won't. This is so that on the next scheduling
1428
+ # iteration when they have transitioned to the decode stage, they are
1429
+ # properly prioritized over sequences that are still in the prefill
1430
+ # stage.
1431
+ self.running.extend(
1432
+ self._order_finishing_prefills_first(
1433
+ running_scheduled.prefill_seq_groups))
1434
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1435
+
1436
+ # Update swapped requests.
1437
+ self.swapped.extend(running_scheduled.swapped_out)
1438
+ # Put prefills first due to the attention backend's ordering assumption.
1439
+ scheduled_seq_groups = (prefills.seq_groups +
1440
+ running_scheduled.prefill_seq_groups +
1441
+ swapped_in.prefill_seq_groups +
1442
+ running_scheduled.decode_seq_groups +
1443
+ swapped_in.decode_seq_groups)
1444
+ num_prefill_groups = (len(prefills.seq_groups) +
1445
+ len(swapped_in.prefill_seq_groups) +
1446
+ len(running_scheduled.prefill_seq_groups))
1447
+ # If all are prompts, then we set num_lookahead_slots to 0;
1448
+ # this allows us to go through the `no_spec` path in
1449
+ # `spec_decode_worker.py`
1450
+ all_prefills = len(scheduled_seq_groups) == num_prefill_groups
1451
+ num_lookahead_slots = (0 if
1452
+ (all_prefills
1453
+ and not self.scheduler_config.is_multi_step)
1454
+ else running_scheduled.num_lookahead_slots)
1455
+ return SchedulerOutputs(
1456
+ scheduled_seq_groups=scheduled_seq_groups,
1457
+ num_prefill_groups=num_prefill_groups,
1458
+ num_batched_tokens=budget.num_batched_tokens +
1459
+ budget.num_cached_tokens,
1460
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1461
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1462
+ blocks_to_copy=running_scheduled.blocks_to_copy +
1463
+ swapped_in.blocks_to_copy,
1464
+ ignored_seq_groups=prefills.ignored_seq_groups +
1465
+ swapped_in.infeasible_seq_groups,
1466
+ num_lookahead_slots=num_lookahead_slots,
1467
+ running_queue_size=len(self.running),
1468
+ preempted=(len(running_scheduled.preempted) +
1469
+ len(running_scheduled.swapped_out)),
1470
+ )
1471
+
1472
+ def _order_finishing_prefills_first(
1473
+ self, scheduled_prefill_seqs: List[ScheduledSequenceGroup]
1474
+ ) -> List[SequenceGroup]:
1475
+ """Returns a list of prefilling SequenceGroups where sequences that are
1476
+ scheduled to finish prefilling are listed first"""
1477
+ finishing = [
1478
+ s.seq_group for s in scheduled_prefill_seqs
1479
+ if s.seq_group.get_num_uncomputed_tokens() == s.token_chunk_size
1480
+ ]
1481
+ not_finishing = [
1482
+ s.seq_group for s in scheduled_prefill_seqs
1483
+ if s.seq_group.get_num_uncomputed_tokens() != s.token_chunk_size
1484
+ ]
1485
+ return finishing + not_finishing
1486
+
1487
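A small sketch of that reordering with plain tuples standing in for ScheduledSequenceGroup (the values are invented): a prefill whose remaining uncomputed tokens equal its scheduled chunk will finish prefilling this step, so it is moved ahead of prefills that still have tokens left.

# (name, uncomputed_tokens, token_chunk_size); example numbers only.
scheduled = [("long", 4096, 512), ("short", 256, 256), ("medium", 900, 512)]
finishing = [s for s in scheduled if s[1] == s[2]]
not_finishing = [s for s in scheduled if s[1] != s[2]]
print([s[0] for s in finishing + not_finishing])  # ['short', 'long', 'medium']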
+ def _schedule(self) -> SchedulerOutputs:
1488
+ """Schedule queued requests."""
1489
+ if self.scheduler_config.chunked_prefill_enabled:
1490
+ return self._schedule_chunked_prefill()
1491
+ else:
1492
+ return self._schedule_default()
1493
+
1494
+ def _can_append_slots(self, seq_group: SequenceGroup,
1495
+ enable_chunking: bool) -> bool:
1496
+ """Determine whether or not we have enough space in the KV cache to
1497
+ continue generation of the sequence group.
1498
+ """
1499
+ # This is True only in test cases, to trigger artificial preemption.
1500
+ if (self.enable_artificial_preemption
1501
+ and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
1502
+ and self.artificial_preempt_cnt > 0):
1503
+ self.artificial_preempt_cnt -= 1
1504
+ return False
1505
+
1506
+ is_prefill = seq_group.is_prefill()
1507
+ num_lookahead_slots = self._get_num_lookahead_slots(
1508
+ is_prefill, enable_chunking)
1509
+
1510
+ if is_prefill and num_lookahead_slots > 0:
1511
+ # Appending prefill slots only happens when multi-step and
1512
+ # chunked-prefill are enabled together.
1513
+ assert self.scheduler_config.is_multi_step and enable_chunking
1514
+
1515
+ return self.block_manager.can_append_slots(
1516
+ seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)
1517
+
1518
+ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
1519
+ # async_output_proc is allowed only when we have a single sequence
1520
+ # in the sequence group
1521
+ no_single_seq = seq_group.sampling_params is None or (
1522
+ seq_group.sampling_params.n == 1)
1523
+ return no_single_seq
1524
+
1525
+ def schedule(
1526
+ self
1527
+ ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
1528
+ # Schedule sequence groups.
1529
+ # This function call changes the internal states of the scheduler
1530
+ # such as self.running, self.swapped, and self.waiting.
1531
+ scheduler_start_time = time.perf_counter()
1532
+
1533
+ scheduler_outputs: SchedulerOutputs = self._schedule()
1534
+ now = time.time()
1535
+
1536
+ if not self.cache_config.enable_prefix_caching:
1537
+ common_computed_block_nums = []
1538
+
1539
+ allow_async_output_proc: bool = self.use_async_output_proc
1540
+
1541
+ # Create input data structures.
1542
+ seq_group_metadata_list: List[SequenceGroupMetadata] = []
1543
+ for i, scheduled_seq_group in enumerate(
1544
+ scheduler_outputs.scheduled_seq_groups):
1545
+ seq_group = scheduled_seq_group.seq_group
1546
+ token_chunk_size = scheduled_seq_group.token_chunk_size
1547
+ seq_group.maybe_set_first_scheduled_time(now)
1548
+
1549
+ seq_group_metadata = self._seq_group_metadata_cache[
1550
+ self.cache_id].get_object()
1551
+ seq_group_metadata.seq_data.clear()
1552
+ seq_group_metadata.block_tables.clear()
1553
+
1554
+ # seq_id -> SequenceData
1555
+ seq_data: Dict[int, SequenceData] = {}
1556
+ # seq_id -> physical block numbers
1557
+ block_tables: Dict[int, List[int]] = {}
1558
+
1559
+ if seq_group.is_encoder_decoder():
1560
+ # Encoder associated with SequenceGroup
1561
+ encoder_seq = seq_group.get_encoder_seq()
1562
+ assert encoder_seq is not None
1563
+ encoder_seq_data = encoder_seq.data
1564
+ # Block table for cross-attention
1565
+ # Also managed at SequenceGroup level
1566
+ cross_block_table = self.block_manager.get_cross_block_table(
1567
+ seq_group)
1568
+ else:
1569
+ encoder_seq_data = None
1570
+ cross_block_table = None
1571
+
1572
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1573
+ seq_id = seq.seq_id
1574
+ seq_data[seq_id] = seq.data
1575
+ block_tables[seq_id] = self.block_manager.get_block_table(seq)
1576
+ self.block_manager.access_all_blocks_in_seq(seq, now)
1577
+
1578
+ if self.cache_config.enable_prefix_caching:
1579
+ common_computed_block_nums = (
1580
+ self.block_manager.get_common_computed_block_ids(
1581
+ seq_group.get_seqs(status=SequenceStatus.RUNNING)))
1582
+
1583
+ do_sample = True
1584
+ is_prompt = seq_group.is_prefill()
1585
+ # We should send the metadata to workers when the first prefill
1586
+ # is sent. Subsequent requests could be chunked prefill or decode.
1587
+ is_first_prefill = False
1588
+ if is_prompt:
1589
+ seqs = seq_group.get_seqs()
1590
+ # Prefill has only 1 sequence.
1591
+ assert len(seqs) == 1
1592
+ num_computed_tokens = seqs[0].data.get_num_computed_tokens()
1593
+ is_first_prefill = num_computed_tokens == 0
1594
+ # If not all prompt tokens are computed after this iteration,
1595
+ # it means the prefill is chunked, and we don't need sampling.
1596
+ # NOTE: We use get_len instead of get_prompt_len because when
1597
+ # a sequence is preempted, prefill includes previously generated
1598
+ # output tokens.
1599
+ if (token_chunk_size + num_computed_tokens
1600
+ < seqs[0].data.get_len()):
1601
+ do_sample = False
1602
+
1603
+ # This assumes scheduled_seq_groups is ordered with
1604
+ # prefills before decodes.
1605
+ if is_first_prefill or not self.scheduler_config.send_delta_data:
1606
+ seq_group_metadata = SequenceGroupMetadata(
1607
+ request_id=seq_group.request_id,
1608
+ is_prompt=is_prompt,
1609
+ seq_data=seq_data,
1610
+ sampling_params=seq_group.sampling_params,
1611
+ block_tables=block_tables,
1612
+ do_sample=do_sample,
1613
+ pooling_params=seq_group.pooling_params,
1614
+ token_chunk_size=token_chunk_size,
1615
+ lora_request=seq_group.lora_request,
1616
+ computed_block_nums=common_computed_block_nums,
1617
+ encoder_seq_data=encoder_seq_data,
1618
+ cross_block_table=cross_block_table,
1619
+ state=seq_group.state,
1620
+ token_type_ids=seq_group.token_type_ids,
1621
+ # `multi_modal_data` will only be present for the 1st comm
1622
+ # between engine and worker.
1623
+ # the subsequent comms can still use delta, but
1624
+ # `multi_modal_data` will be None.
1625
+ multi_modal_data=(seq_group.multi_modal_data
1626
+ if scheduler_outputs.num_prefill_groups
1627
+ > 0 else None),
1628
+ multi_modal_placeholders=(
1629
+ seq_group.multi_modal_placeholders
1630
+ if scheduler_outputs.num_prefill_groups > 0 else None),
1631
+ prompt_adapter_request=seq_group.prompt_adapter_request,
1632
+ )
1633
+ else:
1634
+ # When SPMD mode is enabled, we only send delta data except for
1635
+ # the first request to reduce serialization cost.
1636
+ seq_data_delta = {}
1637
+ for id, data in seq_data.items():
1638
+ seq_data_delta[id] = data.get_delta_and_reset()
1639
+ seq_group_metadata = SequenceGroupMetadataDelta(
1640
+ seq_data_delta,
1641
+ seq_group.request_id,
1642
+ block_tables,
1643
+ is_prompt,
1644
+ do_sample=do_sample,
1645
+ token_chunk_size=token_chunk_size,
1646
+ computed_block_nums=common_computed_block_nums,
1647
+ )
1648
+ seq_group_metadata_list.append(seq_group_metadata)
1649
+
1650
+ if allow_async_output_proc:
1651
+ allow_async_output_proc = self._allow_async_output_proc(
1652
+ seq_group)
1653
+
1654
+ # Now that the batch has been created, we can assume all blocks in the
1655
+ # batch will have been computed before the next scheduling invocation.
1656
+ # This is because the engine assumes that a failure in model execution
1657
+ # will crash the vLLM instance / will not retry.
1658
+ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
1659
+ self.block_manager.mark_blocks_as_computed(
1660
+ scheduled_seq_group.seq_group,
1661
+ scheduled_seq_group.token_chunk_size)
1662
+
1663
+ self._seq_group_metadata_cache[self.next_cache_id].reset()
1664
+
1665
+ scheduler_time = time.perf_counter() - scheduler_start_time
1666
+ # Add this scheduler time to all the sequences that are currently
1667
+ # running. This will help estimate if the scheduler is a significant
1668
+ # component in the e2e latency.
1669
+ for seq_group in self.running:
1670
+ if seq_group is not None and seq_group.metrics is not None:
1671
+ if seq_group.metrics.scheduler_time is not None:
1672
+ seq_group.metrics.scheduler_time += scheduler_time
1673
+ else:
1674
+ seq_group.metrics.scheduler_time = scheduler_time
1675
+
1676
+ # Move to next cache (if exists)
1677
+ self.cache_id = self.next_cache_id
1678
+
1679
+ # Return results
1680
+ return (seq_group_metadata_list, scheduler_outputs,
1681
+ allow_async_output_proc)
1682
+
1683
+ def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
1684
+ self.block_manager.fork(parent_seq, child_seq)
1685
+
1686
+ def free_seq(self, seq: Sequence) -> None:
1687
+ """Free a sequence from a block table."""
1688
+ self.block_manager.free(seq)
1689
+
1690
+ def _free_finished_seqs(self, seq_group: SequenceGroup) -> None:
1691
+ """Free finished seqs in a sequence group."""
1692
+ for seq in seq_group.get_seqs():
1693
+ if seq.is_finished():
1694
+ self.free_seq(seq)
1695
+
1696
+ def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None:
1697
+ if seq_group.is_finished():
1698
+ # Free cross-attention block table, if it exists
1699
+ self._free_seq_group_cross_attn_blocks(seq_group)
1700
+
1701
+ # Add the finished requests to the finished requests list.
1702
+ # This list will be used to update the Mamba cache in the
1703
+ # next step.
1704
+ self._finished_requests_ids.append(seq_group.request_id)
1705
+
1706
+ # Free finished seqs
1707
+ self._free_finished_seqs(seq_group)
1708
+
1709
+ def free_finished_seq_groups(self) -> None:
1710
+ remaining: Deque[SequenceGroup] = deque()
1711
+ for seq_group in self.running:
1712
+ self._free_finished_seq_group(seq_group)
1713
+ if not seq_group.is_finished():
1714
+ remaining.append(seq_group)
1715
+
1716
+ self.running = remaining
1717
+
1718
+ # Handle async stopped sequence groups
1719
+ # (ones that reached max model len)
1720
+ if self._async_stopped:
1721
+ for seq_group in self._async_stopped:
1722
+ self._free_seq_group_cross_attn_blocks(seq_group)
1723
+ self._finished_requests_ids.append(seq_group.request_id)
1724
+
1725
+ # Free finished seqs
1726
+ self._free_finished_seqs(seq_group)
1727
+
1728
+ self._async_stopped.clear()
1729
+
1730
+ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
1731
+ self.block_manager.allocate(seq_group)
1732
+ for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
1733
+ seq.status = SequenceStatus.RUNNING
1734
+
1735
+ def _append_slots(
1736
+ self,
1737
+ seq_group: SequenceGroup,
1738
+ blocks_to_copy: List[Tuple[int, int]],
1739
+ enable_chunking: bool = False,
1740
+ ) -> None:
1741
+ """Appends new slots to the sequences in the given sequence group.
1742
+
1743
+ Args:
1744
+ seq_group (SequenceGroup): The sequence group containing the
1745
+ sequences to append slots to.
1746
+ blocks_to_copy (List[Tuple[int, int]]): A list of tuples of two
1747
+ ints, the first int is the source block index, and the second
1748
+ int is the destination block index. This list is updated with
1749
+ the new source and destination block indices for the appended
1750
+ slots.
1751
+ enable_chunking (bool): True if chunked prefill is enabled.
1752
+ """
1753
+ is_prefill: bool = seq_group.is_prefill()
1754
+ num_lookahead_slots: int = self._get_num_lookahead_slots(
1755
+ is_prefill, enable_chunking)
1756
+
1757
+ seq_group.init_multi_step_from_lookahead_slots(
1758
+ num_lookahead_slots,
1759
+ num_scheduler_steps=self.scheduler_config.num_scheduler_steps,
1760
+ is_multi_step=self.scheduler_config.is_multi_step,
1761
+ enable_chunking=enable_chunking,
1762
+ )
1763
+
1764
+ seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING
1765
+ if self.scheduler_config.is_multi_step and enable_chunking:
1766
+ # In multi-step chunked-prefill any sequence type can have
1767
+ # slots appended.
1768
+ seq_status = None
1769
+
1770
+ for seq in seq_group.get_seqs(status=seq_status):
1771
+ cows = self.block_manager.append_slots(seq, num_lookahead_slots)
1772
+ if len(cows) > 0:
1773
+ blocks_to_copy.extend(cows)
1774
+
1775
+ def _preempt(self, seq_group: SequenceGroup,
1776
+ blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
1777
+ # If preemption mode is not specified, we determine the mode as follows:
1778
+ # We use recomputation by default since it incurs lower overhead than
1779
+ # swapping. However, when the sequence group has multiple sequences
1780
+ # (e.g., beam search), recomputation is not currently supported. In
1781
+ # such a case, we use swapping instead.
1782
+ # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
1783
+ # As swapped sequences are prioritized over waiting sequences,
1784
+ # sequence groups with multiple sequences are implicitly prioritized
1785
+ # over sequence groups with a single sequence.
1786
+ # TODO(woosuk): Support recomputation for sequence groups with multiple
1787
+ # sequences. This may require a more sophisticated CUDA kernel.
1788
+ if self.user_specified_preemption_mode is None:
1789
+ if seq_group.get_max_num_running_seqs() == 1:
1790
+ preemption_mode = PreemptionMode.RECOMPUTE
1791
+ else:
1792
+ preemption_mode = PreemptionMode.SWAP
1793
+
1794
+ elif self.user_specified_preemption_mode == "swap":
1795
+ preemption_mode = PreemptionMode.SWAP
1796
+ else:
1797
+ preemption_mode = PreemptionMode.RECOMPUTE
1798
+
1799
+ if self.num_cumulative_preemption % 50 == 0:
1800
+ logger.warning(
1801
+ "Sequence group %s is preempted by %s mode because there is "
1802
+ "not enough KV cache space. This can affect the end-to-end "
1803
+ "performance. Increase gpu_memory_utilization or "
1804
+ "tensor_parallel_size to provide more KV cache memory. "
1805
+ "total_num_cumulative_preemption=%d",
1806
+ seq_group.request_id,
1807
+ preemption_mode,
1808
+ self.num_cumulative_preemption + 1,
1809
+ )
1810
+ self.num_cumulative_preemption += 1
1811
+
1812
+ if preemption_mode == PreemptionMode.RECOMPUTE:
1813
+ self._preempt_by_recompute(seq_group)
1814
+ elif preemption_mode == PreemptionMode.SWAP:
1815
+ self._preempt_by_swap(seq_group, blocks_to_swap_out)
1816
+ else:
1817
+ raise AssertionError("Invalid preemption mode.")
1818
+ return preemption_mode
1819
+
1820
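A compressed sketch of the mode selection above, ignoring the periodic warning log (plain strings stand in for the PreemptionMode enum): recomputation is the default for single-sequence groups, swapping is used for multi-sequence groups such as beam search, and an explicit user setting overrides both.

# Stand-in strings instead of PreemptionMode; illustration only.
def pick_preemption_mode(user_mode, max_num_running_seqs):
    if user_mode is None:
        return "RECOMPUTE" if max_num_running_seqs == 1 else "SWAP"
    return "SWAP" if user_mode == "swap" else "RECOMPUTE"

print(pick_preemption_mode(None, 1))    # RECOMPUTE
print(pick_preemption_mode(None, 4))    # SWAP (e.g. beam search)
print(pick_preemption_mode("swap", 1))  # SWAP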
+ def _preempt_by_recompute(
1821
+ self,
1822
+ seq_group: SequenceGroup,
1823
+ ) -> None:
1824
+ seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
1825
+ assert len(seqs) == 1
1826
+ for seq in seqs:
1827
+ seq.status = SequenceStatus.WAITING
1828
+ self.free_seq(seq)
1829
+ seq.reset_state_for_recompute()
1830
+ self._free_seq_group_cross_attn_blocks(seq_group)
1831
+
1832
+ def _preempt_by_swap(
1833
+ self,
1834
+ seq_group: SequenceGroup,
1835
+ blocks_to_swap_out: List[Tuple[int, int]],
1836
+ ) -> None:
1837
+ self._swap_out(seq_group, blocks_to_swap_out)
1838
+
1839
+ def _swap_in(
1840
+ self,
1841
+ seq_group: SequenceGroup,
1842
+ blocks_to_swap_in: List[Tuple[int, int]],
1843
+ ) -> None:
1844
+ mapping = self.block_manager.swap_in(seq_group)
1845
+ blocks_to_swap_in.extend(mapping)
1846
+ for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
1847
+ seq.status = SequenceStatus.RUNNING
1848
+
1849
+ def _swap_out(
1850
+ self,
1851
+ seq_group: SequenceGroup,
1852
+ blocks_to_swap_out: List[Tuple[int, int]],
1853
+ ) -> None:
1854
+ if not self.block_manager.can_swap_out(seq_group):
1855
+ # FIXME(woosuk): Abort the sequence group instead of aborting the
1856
+ # entire engine.
1857
+ raise RuntimeError(
1858
+ "Aborted due to the lack of CPU swap space. Please increase "
1859
+ "the swap space to avoid this error.")
1860
+ mapping = self.block_manager.swap_out(seq_group)
1861
+ blocks_to_swap_out.extend(mapping)
1862
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1863
+ seq.status = SequenceStatus.SWAPPED
1864
+
1865
+ def _passed_delay(self, now: float) -> bool:
1866
+ if self.prev_prompt:
1867
+ self.last_prompt_latency = now - self.prev_time
1868
+ self.prev_time, self.prev_prompt = now, False
1869
+ # Delay scheduling prompts to let the waiting queue fill up
1870
+ if self.scheduler_config.delay_factor > 0 and self.waiting:
1871
+ earliest_arrival_time = min(
1872
+ [e.metrics.arrival_time for e in self.waiting])
1873
+ passed_delay = ((now - earliest_arrival_time)
1874
+ > (self.scheduler_config.delay_factor *
1875
+ self.last_prompt_latency) or not self.running)
1876
+ else:
1877
+ passed_delay = True
1878
+ return passed_delay
1879
+
1880
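Putting assumed numbers on the delay heuristic above: with delay_factor = 0.5 and a last prompt step that took 2.0 s, a new prompt is scheduled only once the oldest waiting request has been queued for more than 1.0 s, or when nothing is running.

# Example values; mirrors the comparison in _passed_delay.
delay_factor, last_prompt_latency = 0.5, 2.0
now, earliest_arrival_time = 100.0, 99.2
nothing_running = False
passed = ((now - earliest_arrival_time) > delay_factor * last_prompt_latency
          or nothing_running)
print(passed)  # False: only 0.8 s waited, below the 1.0 s threshold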
+     def _get_num_lookahead_slots(self, is_prefill: bool,
+                                  enable_chunking: bool) -> int:
+         """The number of slots to allocate per sequence per step, beyond known
+         token ids. Speculative decoding uses these slots to store KV activations
+         of tokens which may or may not be accepted.
+
+         Speculative decoding does not yet support prefill, so we do not perform
+         lookahead allocation for prefill.
+
+         When chunking is enabled with multi-step, we allocate lookahead slots
+         for the prefills, for when the prefills turn into decodes in the first
+         step.
+         """
+         if is_prefill:
+             if self.scheduler_config.is_multi_step and enable_chunking:
+                 # num_lookahead_slots was introduced in the context of decodes,
+                 # in Speculative Decoding.
+                 # When num_scheduler_steps is 8, say, num_lookahead_slots is 7:
+                 # we always do one decode step anyway and wish to do 7 more.
+                 #
+                 # "Lookaheads" for prefills were introduced to support
+                 # Chunked-Prefill in Multi-Step.
+                 return self.scheduler_config.num_lookahead_slots + 1
+             else:
+                 return 0
+
+         return self.scheduler_config.num_lookahead_slots
+
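A small sketch of the branch above, using the example from the comment (num_scheduler_steps = 8, hence num_lookahead_slots = 7); the free function below only mirrors the method's logic for illustration and is not vLLM API:

# Mirrors _get_num_lookahead_slots for illustration only.
def lookahead_slots(is_prefill: bool, enable_chunking: bool,
                    is_multi_step: bool, num_lookahead_slots: int) -> int:
    if is_prefill:
        # Only multi-step + chunked prefill pre-allocates slots for prefills.
        return num_lookahead_slots + 1 if (is_multi_step
                                           and enable_chunking) else 0
    return num_lookahead_slots

assert lookahead_slots(True, True, True, 7) == 8    # prefill, multi-step + chunking
assert lookahead_slots(True, False, True, 7) == 0   # plain prefill
assert lookahead_slots(False, True, True, 7) == 7   # decode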
+     def _get_num_new_uncached_and_cached_tokens(
+         self,
+         seq_group: SequenceGroup,
+         status: SequenceStatus,
+         enable_chunking: bool,
+         budget: SchedulingBudget,
+         partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
+     ) -> Tuple[int, int]:
+         """
+         Returns the number of new uncached and cached tokens to schedule for a
+         given sequence group that's in a given `status`.
+
+         The API could chunk the number of tokens to compute based on `budget`
+         if `enable_chunking` is True. If a sequence group has multiple
+         sequences (e.g., running beam search), it means it is in the decoding
+         phase, so chunking doesn't happen.
+
+         Returns (0, 0) if the new token cannot be computed due to token budget.
+
+         The cached tokens' blocks are already computed, and the attention
+         backend will reuse the cached blocks rather than recomputing them. So
+         the scheduler could schedule these cached tokens "for free".
+
+         Args:
+             seq_group: The sequence group to get the number of new tokens to
+                 schedule.
+             status: The status of the sequences to get the number of new tokens
+                 to schedule.
+             enable_chunking: Whether to chunk the number of tokens to compute.
+             budget: The budget to chunk the number of tokens to compute.
+             partial_prefill_metadata: Information about the partial prefills
+                 that are currently running.
+
+         Returns:
+             A tuple of two ints. The first int is the number of new uncached
+             tokens to schedule. The second int is the number of cached tokens.
+             If no more new tokens can be scheduled, returns (0, 0).
+         """
+         num_cached_new_tokens = 0
+         num_uncached_new_tokens = 0
+
+         seqs = seq_group.get_seqs(status=status)
+         # Compute the number of new uncached and cached tokens for
+         # each sequence.
+         for seq in seqs:
+             if not seq.is_prefill():
+                 # Decode sequences should always just have 1 uncached token.
+                 # TODO(rickyx): Actually is this still correct for multi-step?
+                 num_uncached_new_tokens += 1
+                 continue
+
+             num_computed_tokens_seq = seq.get_num_computed_tokens()
+             all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq
+             if not self.cache_config.enable_prefix_caching:
+                 # If prefix caching is not enabled, all new tokens are uncached.
+                 num_uncached_new_tokens += all_num_new_tokens_seq
+                 continue
+
+             # NOTE: a cached token might currently be in a block that's in an
+             # evictor, meaning that it's not yet allocated. However, we don't
+             # exclude such tokens from the cache count because they are
+             # guaranteed to be allocated later if the sequence can be allocated.
+             num_cached_tokens_seq = self.block_manager.get_num_cached_tokens(
+                 seq)
+
+             # Sanity check.
+             if num_cached_tokens_seq < num_computed_tokens_seq:
+                 # This should only happen with chunked prefill, and
+                 # the seq is still in prefill. The `num_cached_tokens_seq`
+                 # is the value we calculated on scheduling the first prefill.
+                 # For subsequent continuous prefill steps, we cached the
+                 # number of cached tokens for the sequence, so the cached token
+                 # count could be less than the number of computed tokens.
+                 # See comments on `ComputedBlocksTracker` for more details.
+                 assert (
+                     seq.is_prefill() and seq.status == SequenceStatus.RUNNING
+                     and self.scheduler_config.chunked_prefill_enabled
+                 ), ("Number of cached tokens should not be less than the "
+                     "number of computed tokens for a sequence that's still "
+                     f"in prefill. But there are {num_cached_tokens_seq} cached "
+                     f"tokens and {num_computed_tokens_seq} computed tokens "
+                     f"for sequence {seq.seq_id}.")
+
+             num_cached_new_tokens_seq = max(
+                 0, num_cached_tokens_seq - num_computed_tokens_seq)
+             num_uncached_new_tokens_seq = (all_num_new_tokens_seq -
+                                            num_cached_new_tokens_seq)
+
+             num_uncached_new_tokens += num_uncached_new_tokens_seq
+             num_cached_new_tokens += num_cached_new_tokens_seq
+
+         if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0:
+             # For a fully cached sequence, we still need to recompute the
+             # last token, so at least 1 uncached token must be scheduled.
+             # See ModelRunner._compute_for_prefix_cache_hit for more details.
+             num_uncached_new_tokens = 1
+             num_cached_new_tokens -= 1
+
+         if enable_chunking and len(seqs) == 1:
+             # Chunk if a running request cannot fit in the given budget.
+             # If the number of seqs > 1, it means it is doing beam search
+             # in a decode phase. Do not chunk.
+             num_uncached_new_tokens = self._chunk_new_tokens_to_schedule(
+                 self.scheduler_config,
+                 self.cache_config,
+                 budget,
+                 self._get_prompt_limit(seq_group),
+                 num_uncached_new_tokens,
+                 self.partial_prefill_budget_lookup_list,
+                 partial_prefill_metadata,
+             )
+
+         return num_uncached_new_tokens, num_cached_new_tokens
+
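A worked example of the accounting above, with made-up numbers: a prefill sequence of length 48 with 0 computed tokens and a 32-token prefix-cache hit yields (uncached=16, cached=32); if the whole 48-token prompt is cached, the fully-cached adjustment keeps one token uncached, giving (1, 47). The helper below only mirrors the single-sequence case for illustration:

# Made-up numbers; mirrors the per-sequence accounting in
# _get_num_new_uncached_and_cached_tokens (prefix caching enabled).
def split_new_tokens(seq_len: int, num_computed: int, num_cached: int):
    all_new = seq_len - num_computed
    cached_new = max(0, num_cached - num_computed)
    uncached_new = all_new - cached_new
    if uncached_new == 0 and cached_new > 0:
        # A fully cached prompt still needs its last token recomputed.
        uncached_new, cached_new = 1, cached_new - 1
    return uncached_new, cached_new

assert split_new_tokens(48, 0, 32) == (16, 32)
assert split_new_tokens(48, 0, 48) == (1, 47)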
+     @staticmethod
+     def _chunk_new_tokens_to_schedule(
+         scheduler_config: SchedulerConfig,
+         cache_config: CacheConfig,
+         budget: SchedulingBudget,
+         prompt_limit: int,
+         num_new_tokens: int,
+         partial_prefill_budget_lookup_list: List[int],
+         partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
+     ) -> int:
+         """
+         Chunks the number of new tokens to schedule based on the budget when
+         chunked prefill is enabled.
+
+         Args:
+             scheduler_config: The scheduler config.
+             cache_config: The cache config.
+             budget: The budget to chunk the number of tokens to compute.
+             prompt_limit: The maximum number of tokens allowed in a prompt.
+             num_new_tokens: The number of new tokens to schedule.
+             partial_prefill_budget_lookup_list: The per-prefill token budget,
+                 indexed by the number of partial prefills being scheduled.
+             partial_prefill_metadata: Information about the partial prefills
+                 that are currently running.
+
+         Returns:
+             The number of new tokens to schedule after chunking.
+         """
+         remaining_token_budget = budget.remaining_token_budget()
+         if scheduler_config.is_multi_step:
+             # The current multi-step + chunked prefill capability does
+             # not actually support chunking prompts.
+             #
+             # Therefore, `num_new_tokens` is computed in the same fashion
+             # for both multi-step+chunked-prefill &
+             # multi-step+chunked-prefill+APC.
+             #
+             # Prompts with more tokens than the current remaining budget
+             # are postponed to future scheduler steps.
+             if num_new_tokens > prompt_limit:
+                 # If the seq_group is in prompt-stage, pass the
+                 # num_new_tokens as-is so the caller can ignore
+                 # the sequence.
+                 return num_new_tokens
+
+             return 0 if num_new_tokens > \
+                 remaining_token_budget else num_new_tokens
+
+         # Get the number of tokens to allocate to this prefill slot.
+         prefill_slot_budget = (
+             remaining_token_budget if partial_prefill_metadata is None else
+             partial_prefill_budget_lookup_list[
+                 partial_prefill_metadata.schedulable_prefills])
+
+         if cache_config.enable_prefix_caching:
+             # When prefix caching is enabled and we're partially prefilling
+             # a sequence, we always allocate a number of new tokens that is
+             # divisible by the block size to avoid partial block matching.
+             block_size = cache_config.block_size
+             # Don't exceed either the total budget or the slot budget.
+             # Take the min of those and round down to the next lowest
+             # multiple of the block size:
+             remaining_token_budget = (
+                 min(remaining_token_budget, prefill_slot_budget) //
+                 block_size) * block_size
+             # NB: In the case where num_new_tokens < budget, we are
+             # finishing prefill for this sequence, so we do not need to
+             # allocate a full block.
+
+         num_new_tokens = min(num_new_tokens, remaining_token_budget,
+                              prefill_slot_budget)
+
+         return num_new_tokens
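To make the prefix-caching branch concrete, a made-up example: with a remaining token budget of 100, a per-slot budget of 56, and block_size = 16, the budget is rounded down to floor(min(100, 56) / 16) * 16 = 48, so at most 48 new prompt tokens are scheduled this step.

# Made-up numbers illustrating the block-aligned chunking above.
remaining_token_budget = 100
prefill_slot_budget = 56
block_size = 16
num_new_tokens = 300        # prompt tokens still to prefill

aligned_budget = (min(remaining_token_budget, prefill_slot_budget)
                  // block_size) * block_size        # -> 48
num_new_tokens = min(num_new_tokens, aligned_budget, prefill_slot_budget)
print(num_new_tokens)       # 48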