vllm-cpu-amxbf16 0.9.1 (vllm_cpu_amxbf16-0.9.1-cp312-cp312-manylinux_2_17_x86_64.whl)

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
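For reference, a listing like the one below (file path plus added and removed line counts) can be reproduced directly from the wheel archive. This is a minimal sketch, not the tooling used to generate this page; it assumes the wheel file named below is available locally and approximates line counts by decoding each member as UTF-8, so binary members (.so, .jpg) report 0, matching the "+0 -0" entries in the listing.

```python
# Minimal sketch: reproduce a "path +added -removed" manifest from a wheel.
# For a newly published wheel, every file appears as "+N -0".
import zipfile

WHEEL = "vllm_cpu_amxbf16-0.9.1-cp312-cp312-manylinux_2_17_x86_64.whl"  # assumed local path

with zipfile.ZipFile(WHEEL) as wheel:
    # Skip directory entries; keep only regular archive members.
    names = sorted(n for n in wheel.namelist() if not n.endswith("/"))
    for i, name in enumerate(names, start=1):
        data = wheel.read(name)
        try:
            # Text files: count newlines as the number of added lines.
            added = data.decode("utf-8").count("\n")
        except UnicodeDecodeError:
            # Binary files are listed with a zero line count.
            added = 0
        print(f"{i}. {name} +{added} -0")
```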
Files changed (1197)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +53 -0
  3. vllm/_custom_ops.py +1828 -0
  4. vllm/_ipex_ops.py +244 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +115 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +308 -0
  20. vllm/attention/backends/blocksparse_attn.py +461 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1498 -0
  23. vllm/attention/backends/flash_attn.py +1003 -0
  24. vllm/attention/backends/flashinfer.py +1104 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +313 -0
  27. vllm/attention/backends/ipex_attn.py +398 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1385 -0
  30. vllm/attention/backends/pallas.py +351 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +975 -0
  34. vllm/attention/backends/torch_sdpa.py +703 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +802 -0
  38. vllm/attention/layer.py +468 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +906 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/prefix_prefill.py +902 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  54. vllm/attention/ops/triton_decode_attention.py +674 -0
  55. vllm/attention/ops/triton_flash_attention.py +979 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  57. vllm/attention/ops/triton_unified_attention.py +334 -0
  58. vllm/attention/selector.py +187 -0
  59. vllm/attention/utils/fa_utils.py +55 -0
  60. vllm/beam_search.py +87 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +1185 -0
  63. vllm/benchmarks/endpoint_request_func.py +381 -0
  64. vllm/benchmarks/latency.py +168 -0
  65. vllm/benchmarks/serve.py +1135 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +70 -0
  68. vllm/collect_env.py +820 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +89 -0
  71. vllm/compilation/backends.py +563 -0
  72. vllm/compilation/base_piecewise_backend.py +72 -0
  73. vllm/compilation/collective_fusion.py +127 -0
  74. vllm/compilation/compiler_interface.py +544 -0
  75. vllm/compilation/counter.py +38 -0
  76. vllm/compilation/cuda_piecewise_backend.py +214 -0
  77. vllm/compilation/decorators.py +250 -0
  78. vllm/compilation/fix_functionalization.py +191 -0
  79. vllm/compilation/fusion.py +618 -0
  80. vllm/compilation/fx_utils.py +62 -0
  81. vllm/compilation/inductor_pass.py +115 -0
  82. vllm/compilation/monitor.py +39 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +137 -0
  85. vllm/compilation/pass_manager.py +78 -0
  86. vllm/compilation/sequence_parallelism.py +268 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +67 -0
  89. vllm/compilation/wrapper.py +135 -0
  90. vllm/config.py +4746 -0
  91. vllm/connections.py +174 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +399 -0
  95. vllm/core/block/common.py +371 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  97. vllm/core/block/interfaces.py +319 -0
  98. vllm/core/block/naive_block.py +466 -0
  99. vllm/core/block/prefix_caching_block.py +1135 -0
  100. vllm/core/block/utils.py +28 -0
  101. vllm/core/block_manager.py +521 -0
  102. vllm/core/evictor.py +157 -0
  103. vllm/core/interfaces.py +135 -0
  104. vllm/core/placeholder_block_space_manager.py +100 -0
  105. vllm/core/scheduler.py +2093 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +281 -0
  108. vllm/distributed/__init__.py +6 -0
  109. vllm/distributed/communication_op.py +41 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +264 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +176 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  120. vllm/distributed/device_communicators/pynccl.py +218 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +341 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  125. vllm/distributed/kv_events.py +356 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +12 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +128 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +108 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +134 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1030 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +384 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +280 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  152. vllm/distributed/parallel_state.py +1296 -0
  153. vllm/distributed/tpu_distributed_utils.py +177 -0
  154. vllm/distributed/utils.py +536 -0
  155. vllm/engine/__init__.py +0 -0
  156. vllm/engine/arg_utils.py +1708 -0
  157. vllm/engine/async_llm_engine.py +1200 -0
  158. vllm/engine/async_timeout.py +173 -0
  159. vllm/engine/llm_engine.py +2097 -0
  160. vllm/engine/metrics.py +629 -0
  161. vllm/engine/metrics_types.py +94 -0
  162. vllm/engine/multiprocessing/__init__.py +148 -0
  163. vllm/engine/multiprocessing/client.py +681 -0
  164. vllm/engine/multiprocessing/engine.py +460 -0
  165. vllm/engine/output_processor/__init__.py +0 -0
  166. vllm/engine/output_processor/interfaces.py +75 -0
  167. vllm/engine/output_processor/multi_step.py +216 -0
  168. vllm/engine/output_processor/single_step.py +145 -0
  169. vllm/engine/output_processor/stop_checker.py +131 -0
  170. vllm/engine/output_processor/util.py +28 -0
  171. vllm/engine/protocol.py +317 -0
  172. vllm/entrypoints/__init__.py +0 -0
  173. vllm/entrypoints/api_server.py +178 -0
  174. vllm/entrypoints/chat_utils.py +1299 -0
  175. vllm/entrypoints/cli/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  177. vllm/entrypoints/cli/benchmark/base.py +39 -0
  178. vllm/entrypoints/cli/benchmark/latency.py +30 -0
  179. vllm/entrypoints/cli/benchmark/main.py +54 -0
  180. vllm/entrypoints/cli/benchmark/serve.py +30 -0
  181. vllm/entrypoints/cli/benchmark/throughput.py +30 -0
  182. vllm/entrypoints/cli/collect_env.py +35 -0
  183. vllm/entrypoints/cli/main.py +65 -0
  184. vllm/entrypoints/cli/openai.py +205 -0
  185. vllm/entrypoints/cli/run_batch.py +62 -0
  186. vllm/entrypoints/cli/serve.py +328 -0
  187. vllm/entrypoints/cli/types.py +25 -0
  188. vllm/entrypoints/launcher.py +147 -0
  189. vllm/entrypoints/llm.py +1544 -0
  190. vllm/entrypoints/logger.py +50 -0
  191. vllm/entrypoints/openai/__init__.py +0 -0
  192. vllm/entrypoints/openai/api_server.py +1387 -0
  193. vllm/entrypoints/openai/cli_args.py +315 -0
  194. vllm/entrypoints/openai/logits_processors.py +90 -0
  195. vllm/entrypoints/openai/protocol.py +1913 -0
  196. vllm/entrypoints/openai/run_batch.py +463 -0
  197. vllm/entrypoints/openai/serving_chat.py +1221 -0
  198. vllm/entrypoints/openai/serving_classification.py +160 -0
  199. vllm/entrypoints/openai/serving_completion.py +592 -0
  200. vllm/entrypoints/openai/serving_embedding.py +201 -0
  201. vllm/entrypoints/openai/serving_engine.py +986 -0
  202. vllm/entrypoints/openai/serving_models.py +315 -0
  203. vllm/entrypoints/openai/serving_pooling.py +232 -0
  204. vllm/entrypoints/openai/serving_score.py +433 -0
  205. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  206. vllm/entrypoints/openai/serving_transcription.py +424 -0
  207. vllm/entrypoints/openai/tool_parsers/__init__.py +23 -0
  208. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  209. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  210. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  211. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  212. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  213. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  214. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  215. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  216. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  217. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  218. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  219. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  220. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  221. vllm/entrypoints/score_utils.py +50 -0
  222. vllm/entrypoints/ssl.py +75 -0
  223. vllm/entrypoints/utils.py +233 -0
  224. vllm/env_override.py +41 -0
  225. vllm/envs.py +944 -0
  226. vllm/executor/__init__.py +0 -0
  227. vllm/executor/executor_base.py +401 -0
  228. vllm/executor/mp_distributed_executor.py +244 -0
  229. vllm/executor/msgspec_utils.py +30 -0
  230. vllm/executor/multiproc_worker_utils.py +313 -0
  231. vllm/executor/ray_distributed_executor.py +701 -0
  232. vllm/executor/ray_utils.py +399 -0
  233. vllm/executor/uniproc_executor.py +139 -0
  234. vllm/forward_context.py +179 -0
  235. vllm/inputs/__init__.py +41 -0
  236. vllm/inputs/data.py +331 -0
  237. vllm/inputs/parse.py +151 -0
  238. vllm/inputs/preprocess.py +909 -0
  239. vllm/inputs/registry.py +237 -0
  240. vllm/jsontree.py +80 -0
  241. vllm/logger.py +212 -0
  242. vllm/logging_utils/__init__.py +8 -0
  243. vllm/logging_utils/dump_input.py +85 -0
  244. vllm/logging_utils/formatter.py +18 -0
  245. vllm/logits_process.py +119 -0
  246. vllm/lora/__init__.py +0 -0
  247. vllm/lora/fully_sharded_layers.py +355 -0
  248. vllm/lora/layers.py +1285 -0
  249. vllm/lora/lora.py +199 -0
  250. vllm/lora/models.py +818 -0
  251. vllm/lora/ops/__init__.py +0 -0
  252. vllm/lora/ops/torch_ops/__init__.py +16 -0
  253. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  254. vllm/lora/ops/triton_ops/__init__.py +12 -0
  255. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  256. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  257. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  258. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  259. vllm/lora/ops/triton_ops/utils.py +120 -0
  260. vllm/lora/ops/xla_ops/__init__.py +7 -0
  261. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  262. vllm/lora/peft_helper.py +136 -0
  263. vllm/lora/punica_wrapper/__init__.py +10 -0
  264. vllm/lora/punica_wrapper/punica_base.py +485 -0
  265. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  266. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  267. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  268. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  269. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  270. vllm/lora/punica_wrapper/utils.py +164 -0
  271. vllm/lora/request.py +99 -0
  272. vllm/lora/resolver.py +85 -0
  273. vllm/lora/utils.py +240 -0
  274. vllm/lora/worker_manager.py +259 -0
  275. vllm/model_executor/__init__.py +16 -0
  276. vllm/model_executor/custom_op.py +152 -0
  277. vllm/model_executor/guided_decoding/__init__.py +181 -0
  278. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  279. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  280. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  281. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  282. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  283. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  284. vllm/model_executor/guided_decoding/utils.py +242 -0
  285. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  286. vllm/model_executor/layers/__init__.py +0 -0
  287. vllm/model_executor/layers/activation.py +369 -0
  288. vllm/model_executor/layers/fused_moe/__init__.py +54 -0
  289. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +125 -0
  290. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +117 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  455. vllm/model_executor/layers/fused_moe/cutlass_moe.py +461 -0
  456. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +240 -0
  457. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +240 -0
  458. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +186 -0
  459. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +775 -0
  460. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +232 -0
  461. vllm/model_executor/layers/fused_moe/fused_moe.py +1724 -0
  462. vllm/model_executor/layers/fused_moe/layer.py +1535 -0
  463. vllm/model_executor/layers/fused_moe/modular_kernel.py +446 -0
  464. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  465. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  466. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  467. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  468. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +159 -0
  469. vllm/model_executor/layers/fused_moe/prepare_finalize.py +69 -0
  470. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +421 -0
  471. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +117 -0
  472. vllm/model_executor/layers/fused_moe/utils.py +98 -0
  473. vllm/model_executor/layers/layernorm.py +288 -0
  474. vllm/model_executor/layers/lightning_attn.py +652 -0
  475. vllm/model_executor/layers/linear.py +1524 -0
  476. vllm/model_executor/layers/logits_processor.py +197 -0
  477. vllm/model_executor/layers/mamba/__init__.py +0 -0
  478. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  479. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  480. vllm/model_executor/layers/mamba/mamba_mixer2.py +616 -0
  481. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  482. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  483. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  484. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  485. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  486. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  487. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  488. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  489. vllm/model_executor/layers/pooler.py +350 -0
  490. vllm/model_executor/layers/quantization/__init__.py +157 -0
  491. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  492. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  493. vllm/model_executor/layers/quantization/awq.py +194 -0
  494. vllm/model_executor/layers/quantization/awq_marlin.py +519 -0
  495. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  496. vllm/model_executor/layers/quantization/base_config.py +151 -0
  497. vllm/model_executor/layers/quantization/bitblas.py +461 -0
  498. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +668 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1260 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  505. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  506. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +93 -0
  507. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +178 -0
  508. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  509. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  510. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  511. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  512. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  513. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  514. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  515. vllm/model_executor/layers/quantization/experts_int8.py +196 -0
  516. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  517. vllm/model_executor/layers/quantization/fp8.py +906 -0
  518. vllm/model_executor/layers/quantization/gguf.py +565 -0
  519. vllm/model_executor/layers/quantization/gptq.py +278 -0
  520. vllm/model_executor/layers/quantization/gptq_bitblas.py +445 -0
  521. vllm/model_executor/layers/quantization/gptq_marlin.py +648 -0
  522. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  523. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  524. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  525. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  526. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  527. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  528. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  529. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  530. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  531. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +120 -0
  532. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  533. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  534. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  535. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  536. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  537. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  538. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  539. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  540. vllm/model_executor/layers/quantization/marlin.py +261 -0
  541. vllm/model_executor/layers/quantization/modelopt.py +737 -0
  542. vllm/model_executor/layers/quantization/moe_wna16.py +449 -0
  543. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  544. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  545. vllm/model_executor/layers/quantization/qqq.py +275 -0
  546. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  547. vllm/model_executor/layers/quantization/quark/quark.py +441 -0
  548. vllm/model_executor/layers/quantization/quark/quark_moe.py +237 -0
  549. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  550. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  551. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  552. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +146 -0
  553. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  554. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  555. vllm/model_executor/layers/quantization/schema.py +86 -0
  556. vllm/model_executor/layers/quantization/torchao.py +161 -0
  557. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  558. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  559. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  560. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/fp8_utils.py +618 -0
  764. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  765. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  766. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  767. vllm/model_executor/layers/quantization/utils/machete_utils.py +33 -0
  768. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  769. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  770. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  771. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  772. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  773. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  774. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  775. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +104 -0
  776. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  777. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  778. vllm/model_executor/layers/rejection_sampler.py +406 -0
  779. vllm/model_executor/layers/resampler.py +270 -0
  780. vllm/model_executor/layers/rotary_embedding.py +1862 -0
  781. vllm/model_executor/layers/sampler.py +1204 -0
  782. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  783. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  784. vllm/model_executor/layers/utils.py +95 -0
  785. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  786. vllm/model_executor/model_loader/__init__.py +76 -0
  787. vllm/model_executor/model_loader/base_loader.py +43 -0
  788. vllm/model_executor/model_loader/bitsandbytes_loader.py +570 -0
  789. vllm/model_executor/model_loader/default_loader.py +282 -0
  790. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  791. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  792. vllm/model_executor/model_loader/neuron.py +476 -0
  793. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  794. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  795. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  796. vllm/model_executor/model_loader/tensorizer.py +600 -0
  797. vllm/model_executor/model_loader/tensorizer_loader.py +123 -0
  798. vllm/model_executor/model_loader/tpu.py +112 -0
  799. vllm/model_executor/model_loader/utils.py +302 -0
  800. vllm/model_executor/model_loader/weight_utils.py +782 -0
  801. vllm/model_executor/models/__init__.py +28 -0
  802. vllm/model_executor/models/adapters.py +248 -0
  803. vllm/model_executor/models/aimv2.py +246 -0
  804. vllm/model_executor/models/arctic.py +559 -0
  805. vllm/model_executor/models/aria.py +657 -0
  806. vllm/model_executor/models/aya_vision.py +466 -0
  807. vllm/model_executor/models/baichuan.py +474 -0
  808. vllm/model_executor/models/bamba.py +543 -0
  809. vllm/model_executor/models/bart.py +938 -0
  810. vllm/model_executor/models/bert.py +523 -0
  811. vllm/model_executor/models/bert_with_rope.py +769 -0
  812. vllm/model_executor/models/blip.py +339 -0
  813. vllm/model_executor/models/blip2.py +718 -0
  814. vllm/model_executor/models/bloom.py +373 -0
  815. vllm/model_executor/models/chameleon.py +1136 -0
  816. vllm/model_executor/models/chatglm.py +478 -0
  817. vllm/model_executor/models/clip.py +407 -0
  818. vllm/model_executor/models/commandr.py +472 -0
  819. vllm/model_executor/models/constant_size_cache.py +137 -0
  820. vllm/model_executor/models/dbrx.py +472 -0
  821. vllm/model_executor/models/deepseek.py +486 -0
  822. vllm/model_executor/models/deepseek_mtp.py +269 -0
  823. vllm/model_executor/models/deepseek_v2.py +843 -0
  824. vllm/model_executor/models/deepseek_vl2.py +648 -0
  825. vllm/model_executor/models/eagle.py +260 -0
  826. vllm/model_executor/models/exaone.py +551 -0
  827. vllm/model_executor/models/fairseq2_llama.py +154 -0
  828. vllm/model_executor/models/falcon.py +510 -0
  829. vllm/model_executor/models/falcon_h1.py +685 -0
  830. vllm/model_executor/models/florence2.py +1103 -0
  831. vllm/model_executor/models/fuyu.py +389 -0
  832. vllm/model_executor/models/gemma.py +425 -0
  833. vllm/model_executor/models/gemma2.py +425 -0
  834. vllm/model_executor/models/gemma3.py +533 -0
  835. vllm/model_executor/models/gemma3_mm.py +709 -0
  836. vllm/model_executor/models/glm.py +23 -0
  837. vllm/model_executor/models/glm4.py +305 -0
  838. vllm/model_executor/models/glm4v.py +648 -0
  839. vllm/model_executor/models/gpt2.py +328 -0
  840. vllm/model_executor/models/gpt_bigcode.py +335 -0
  841. vllm/model_executor/models/gpt_j.py +339 -0
  842. vllm/model_executor/models/gpt_neox.py +332 -0
  843. vllm/model_executor/models/granite.py +493 -0
  844. vllm/model_executor/models/granite_speech.py +779 -0
  845. vllm/model_executor/models/granitemoe.py +437 -0
  846. vllm/model_executor/models/granitemoehybrid.py +586 -0
  847. vllm/model_executor/models/granitemoeshared.py +341 -0
  848. vllm/model_executor/models/gritlm.py +224 -0
  849. vllm/model_executor/models/grok1.py +546 -0
  850. vllm/model_executor/models/h2ovl.py +546 -0
  851. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  852. vllm/model_executor/models/idefics3.py +776 -0
  853. vllm/model_executor/models/interfaces.py +572 -0
  854. vllm/model_executor/models/interfaces_base.py +164 -0
  855. vllm/model_executor/models/intern_vit.py +480 -0
  856. vllm/model_executor/models/internlm2.py +455 -0
  857. vllm/model_executor/models/internlm2_ve.py +147 -0
  858. vllm/model_executor/models/internvl.py +1418 -0
  859. vllm/model_executor/models/jais.py +373 -0
  860. vllm/model_executor/models/jamba.py +592 -0
  861. vllm/model_executor/models/kimi_vl.py +577 -0
  862. vllm/model_executor/models/llama.py +644 -0
  863. vllm/model_executor/models/llama4.py +532 -0
  864. vllm/model_executor/models/llama_eagle.py +165 -0
  865. vllm/model_executor/models/llama_eagle3.py +263 -0
  866. vllm/model_executor/models/llava.py +866 -0
  867. vllm/model_executor/models/llava_next.py +586 -0
  868. vllm/model_executor/models/llava_next_video.py +471 -0
  869. vllm/model_executor/models/llava_onevision.py +956 -0
  870. vllm/model_executor/models/mamba.py +273 -0
  871. vllm/model_executor/models/mamba2.py +308 -0
  872. vllm/model_executor/models/mamba_cache.py +76 -0
  873. vllm/model_executor/models/medusa.py +219 -0
  874. vllm/model_executor/models/mimo.py +192 -0
  875. vllm/model_executor/models/mimo_mtp.py +285 -0
  876. vllm/model_executor/models/minicpm.py +592 -0
  877. vllm/model_executor/models/minicpm3.py +230 -0
  878. vllm/model_executor/models/minicpm_eagle.py +391 -0
  879. vllm/model_executor/models/minicpmo.py +759 -0
  880. vllm/model_executor/models/minicpmv.py +1287 -0
  881. vllm/model_executor/models/minimax_cache.py +36 -0
  882. vllm/model_executor/models/minimax_text_01.py +1301 -0
  883. vllm/model_executor/models/minimax_vl_01.py +364 -0
  884. vllm/model_executor/models/mistral3.py +604 -0
  885. vllm/model_executor/models/mixtral.py +488 -0
  886. vllm/model_executor/models/mixtral_quant.py +453 -0
  887. vllm/model_executor/models/mllama.py +1624 -0
  888. vllm/model_executor/models/mllama4.py +938 -0
  889. vllm/model_executor/models/mlp_speculator.py +206 -0
  890. vllm/model_executor/models/modernbert.py +331 -0
  891. vllm/model_executor/models/module_mapping.py +72 -0
  892. vllm/model_executor/models/molmo.py +1568 -0
  893. vllm/model_executor/models/moonvit.py +630 -0
  894. vllm/model_executor/models/mpt.py +331 -0
  895. vllm/model_executor/models/nemotron.py +508 -0
  896. vllm/model_executor/models/nemotron_h.py +573 -0
  897. vllm/model_executor/models/nemotron_nas.py +484 -0
  898. vllm/model_executor/models/nvlm_d.py +216 -0
  899. vllm/model_executor/models/olmo.py +389 -0
  900. vllm/model_executor/models/olmo2.py +414 -0
  901. vllm/model_executor/models/olmoe.py +468 -0
  902. vllm/model_executor/models/opt.py +412 -0
  903. vllm/model_executor/models/orion.py +349 -0
  904. vllm/model_executor/models/ovis.py +567 -0
  905. vllm/model_executor/models/paligemma.py +398 -0
  906. vllm/model_executor/models/persimmon.py +344 -0
  907. vllm/model_executor/models/phi.py +356 -0
  908. vllm/model_executor/models/phi3.py +19 -0
  909. vllm/model_executor/models/phi3_small.py +465 -0
  910. vllm/model_executor/models/phi3v.py +723 -0
  911. vllm/model_executor/models/phi4mm.py +1246 -0
  912. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  913. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  914. vllm/model_executor/models/phimoe.py +665 -0
  915. vllm/model_executor/models/pixtral.py +1316 -0
  916. vllm/model_executor/models/plamo2.py +738 -0
  917. vllm/model_executor/models/prithvi_geospatial_mae.py +232 -0
  918. vllm/model_executor/models/qwen.py +362 -0
  919. vllm/model_executor/models/qwen2.py +497 -0
  920. vllm/model_executor/models/qwen2_5_omni_thinker.py +904 -0
  921. vllm/model_executor/models/qwen2_5_vl.py +1166 -0
  922. vllm/model_executor/models/qwen2_audio.py +410 -0
  923. vllm/model_executor/models/qwen2_moe.py +540 -0
  924. vllm/model_executor/models/qwen2_rm.py +132 -0
  925. vllm/model_executor/models/qwen2_vl.py +1405 -0
  926. vllm/model_executor/models/qwen3.py +321 -0
  927. vllm/model_executor/models/qwen3_moe.py +535 -0
  928. vllm/model_executor/models/qwen_vl.py +785 -0
  929. vllm/model_executor/models/registry.py +622 -0
  930. vllm/model_executor/models/roberta.py +276 -0
  931. vllm/model_executor/models/siglip.py +524 -0
  932. vllm/model_executor/models/skyworkr1v.py +951 -0
  933. vllm/model_executor/models/smolvlm.py +52 -0
  934. vllm/model_executor/models/solar.py +506 -0
  935. vllm/model_executor/models/stablelm.py +343 -0
  936. vllm/model_executor/models/starcoder2.py +356 -0
  937. vllm/model_executor/models/tarsier.py +643 -0
  938. vllm/model_executor/models/telechat2.py +140 -0
  939. vllm/model_executor/models/teleflm.py +79 -0
  940. vllm/model_executor/models/transformers.py +508 -0
  941. vllm/model_executor/models/ultravox.py +656 -0
  942. vllm/model_executor/models/utils.py +731 -0
  943. vllm/model_executor/models/vision.py +147 -0
  944. vllm/model_executor/models/whisper.py +747 -0
  945. vllm/model_executor/models/zamba2.py +1009 -0
  946. vllm/model_executor/parameter.py +459 -0
  947. vllm/model_executor/pooling_metadata.py +72 -0
  948. vllm/model_executor/sampling_metadata.py +597 -0
  949. vllm/model_executor/utils.py +77 -0
  950. vllm/multimodal/__init__.py +33 -0
  951. vllm/multimodal/audio.py +106 -0
  952. vllm/multimodal/base.py +219 -0
  953. vllm/multimodal/hasher.py +118 -0
  954. vllm/multimodal/image.py +97 -0
  955. vllm/multimodal/inputs.py +876 -0
  956. vllm/multimodal/parse.py +461 -0
  957. vllm/multimodal/processing.py +1895 -0
  958. vllm/multimodal/profiling.py +258 -0
  959. vllm/multimodal/registry.py +331 -0
  960. vllm/multimodal/utils.py +436 -0
  961. vllm/multimodal/video.py +198 -0
  962. vllm/outputs.py +512 -0
  963. vllm/platforms/__init__.py +291 -0
  964. vllm/platforms/cpu.py +266 -0
  965. vllm/platforms/cuda.py +526 -0
  966. vllm/platforms/hpu.py +106 -0
  967. vllm/platforms/interface.py +538 -0
  968. vllm/platforms/neuron.py +150 -0
  969. vllm/platforms/rocm.py +435 -0
  970. vllm/platforms/tpu.py +216 -0
  971. vllm/platforms/xpu.py +156 -0
  972. vllm/plugins/__init__.py +94 -0
  973. vllm/plugins/lora_resolvers/README.md +15 -0
  974. vllm/plugins/lora_resolvers/__init__.py +0 -0
  975. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  976. vllm/pooling_params.py +54 -0
  977. vllm/profiler/__init__.py +0 -0
  978. vllm/profiler/layerwise_profile.py +375 -0
  979. vllm/profiler/utils.py +148 -0
  980. vllm/prompt_adapter/__init__.py +0 -0
  981. vllm/prompt_adapter/layers.py +83 -0
  982. vllm/prompt_adapter/models.py +358 -0
  983. vllm/prompt_adapter/request.py +37 -0
  984. vllm/prompt_adapter/utils.py +98 -0
  985. vllm/prompt_adapter/worker_manager.py +179 -0
  986. vllm/py.typed +2 -0
  987. vllm/reasoning/__init__.py +15 -0
  988. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  989. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  990. vllm/reasoning/granite_reasoning_parser.py +363 -0
  991. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  992. vllm/sampling_params.py +602 -0
  993. vllm/scalar_type.py +347 -0
  994. vllm/scripts.py +15 -0
  995. vllm/sequence.py +1568 -0
  996. vllm/spec_decode/__init__.py +0 -0
  997. vllm/spec_decode/batch_expansion.py +506 -0
  998. vllm/spec_decode/draft_model_runner.py +349 -0
  999. vllm/spec_decode/interfaces.py +99 -0
  1000. vllm/spec_decode/medusa_worker.py +138 -0
  1001. vllm/spec_decode/metrics.py +213 -0
  1002. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1003. vllm/spec_decode/mqa_scorer.py +160 -0
  1004. vllm/spec_decode/multi_step_worker.py +423 -0
  1005. vllm/spec_decode/ngram_worker.py +196 -0
  1006. vllm/spec_decode/proposer_worker_base.py +59 -0
  1007. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1008. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1009. vllm/spec_decode/target_model_runner.py +45 -0
  1010. vllm/spec_decode/top1_proposer.py +275 -0
  1011. vllm/spec_decode/util.py +277 -0
  1012. vllm/test_utils.py +130 -0
  1013. vllm/third_party/__init__.py +0 -0
  1014. vllm/third_party/pynvml.py +6140 -0
  1015. vllm/tracing.py +131 -0
  1016. vllm/transformers_utils/__init__.py +24 -0
  1017. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1018. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1019. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1020. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1021. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1022. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1023. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1024. vllm/transformers_utils/config.py +887 -0
  1025. vllm/transformers_utils/configs/__init__.py +61 -0
  1026. vllm/transformers_utils/configs/arctic.py +207 -0
  1027. vllm/transformers_utils/configs/chatglm.py +72 -0
  1028. vllm/transformers_utils/configs/cohere2.py +195 -0
  1029. vllm/transformers_utils/configs/dbrx.py +280 -0
  1030. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1031. vllm/transformers_utils/configs/eagle.py +85 -0
  1032. vllm/transformers_utils/configs/exaone.py +190 -0
  1033. vllm/transformers_utils/configs/falcon.py +90 -0
  1034. vllm/transformers_utils/configs/h2ovl.py +16 -0
  1035. vllm/transformers_utils/configs/internvl.py +54 -0
  1036. vllm/transformers_utils/configs/jais.py +238 -0
  1037. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1038. vllm/transformers_utils/configs/medusa.py +63 -0
  1039. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1040. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1041. vllm/transformers_utils/configs/mllama.py +31 -0
  1042. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1043. vllm/transformers_utils/configs/moonvit.py +33 -0
  1044. vllm/transformers_utils/configs/mpt.py +180 -0
  1045. vllm/transformers_utils/configs/nemotron.py +205 -0
  1046. vllm/transformers_utils/configs/nemotron_h.py +258 -0
  1047. vllm/transformers_utils/configs/nvlm_d.py +15 -0
  1048. vllm/transformers_utils/configs/ovis.py +184 -0
  1049. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1050. vllm/transformers_utils/configs/solar.py +247 -0
  1051. vllm/transformers_utils/configs/telechat2.py +64 -0
  1052. vllm/transformers_utils/configs/ultravox.py +108 -0
  1053. vllm/transformers_utils/detokenizer.py +168 -0
  1054. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1055. vllm/transformers_utils/processor.py +221 -0
  1056. vllm/transformers_utils/processors/__init__.py +8 -0
  1057. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1058. vllm/transformers_utils/processors/ovis.py +420 -0
  1059. vllm/transformers_utils/s3_utils.py +162 -0
  1060. vllm/transformers_utils/tokenizer.py +302 -0
  1061. vllm/transformers_utils/tokenizer_base.py +149 -0
  1062. vllm/transformers_utils/tokenizer_group.py +120 -0
  1063. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1064. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1065. vllm/transformers_utils/utils.py +99 -0
  1066. vllm/triton_utils/__init__.py +14 -0
  1067. vllm/triton_utils/importing.py +50 -0
  1068. vllm/usage/__init__.py +0 -0
  1069. vllm/usage/usage_lib.py +256 -0
  1070. vllm/utils.py +2910 -0
  1071. vllm/v1/__init__.py +0 -0
  1072. vllm/v1/attention/__init__.py +0 -0
  1073. vllm/v1/attention/backends/__init__.py +0 -0
  1074. vllm/v1/attention/backends/cpu_attn.py +163 -0
  1075. vllm/v1/attention/backends/flash_attn.py +869 -0
  1076. vllm/v1/attention/backends/flashinfer.py +651 -0
  1077. vllm/v1/attention/backends/flex_attention.py +477 -0
  1078. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1079. vllm/v1/attention/backends/mla/common.py +931 -0
  1080. vllm/v1/attention/backends/mla/cutlass_mla.py +97 -0
  1081. vllm/v1/attention/backends/mla/flashmla.py +152 -0
  1082. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +220 -0
  1083. vllm/v1/attention/backends/mla/triton_mla.py +120 -0
  1084. vllm/v1/attention/backends/pallas.py +240 -0
  1085. vllm/v1/attention/backends/triton_attn.py +285 -0
  1086. vllm/v1/attention/backends/utils.py +52 -0
  1087. vllm/v1/core/__init__.py +0 -0
  1088. vllm/v1/core/block_pool.py +349 -0
  1089. vllm/v1/core/encoder_cache_manager.py +150 -0
  1090. vllm/v1/core/kv_cache_coordinator.py +363 -0
  1091. vllm/v1/core/kv_cache_manager.py +392 -0
  1092. vllm/v1/core/kv_cache_utils.py +996 -0
  1093. vllm/v1/core/sched/__init__.py +0 -0
  1094. vllm/v1/core/sched/interface.py +150 -0
  1095. vllm/v1/core/sched/output.py +154 -0
  1096. vllm/v1/core/sched/scheduler.py +1044 -0
  1097. vllm/v1/core/sched/utils.py +23 -0
  1098. vllm/v1/core/single_type_kv_cache_manager.py +403 -0
  1099. vllm/v1/engine/__init__.py +173 -0
  1100. vllm/v1/engine/async_llm.py +558 -0
  1101. vllm/v1/engine/coordinator.py +253 -0
  1102. vllm/v1/engine/core.py +961 -0
  1103. vllm/v1/engine/core_client.py +1129 -0
  1104. vllm/v1/engine/detokenizer.py +261 -0
  1105. vllm/v1/engine/exceptions.py +17 -0
  1106. vllm/v1/engine/llm_engine.py +317 -0
  1107. vllm/v1/engine/logprobs.py +199 -0
  1108. vllm/v1/engine/mm_input_cache.py +91 -0
  1109. vllm/v1/engine/output_processor.py +428 -0
  1110. vllm/v1/engine/parallel_sampling.py +133 -0
  1111. vllm/v1/engine/processor.py +407 -0
  1112. vllm/v1/executor/__init__.py +0 -0
  1113. vllm/v1/executor/abstract.py +113 -0
  1114. vllm/v1/executor/multiproc_executor.py +537 -0
  1115. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1116. vllm/v1/kv_cache_interface.py +194 -0
  1117. vllm/v1/metrics/__init__.py +0 -0
  1118. vllm/v1/metrics/loggers.py +523 -0
  1119. vllm/v1/metrics/prometheus.py +82 -0
  1120. vllm/v1/metrics/ray_wrappers.py +131 -0
  1121. vllm/v1/metrics/reader.py +246 -0
  1122. vllm/v1/metrics/stats.py +239 -0
  1123. vllm/v1/outputs.py +116 -0
  1124. vllm/v1/request.py +193 -0
  1125. vllm/v1/sample/__init__.py +0 -0
  1126. vllm/v1/sample/metadata.py +44 -0
  1127. vllm/v1/sample/ops/__init__.py +0 -0
  1128. vllm/v1/sample/ops/bad_words.py +39 -0
  1129. vllm/v1/sample/ops/penalties.py +59 -0
  1130. vllm/v1/sample/ops/topk_topp_sampler.py +293 -0
  1131. vllm/v1/sample/rejection_sampler.py +631 -0
  1132. vllm/v1/sample/sampler.py +286 -0
  1133. vllm/v1/sample/tpu/__init__.py +0 -0
  1134. vllm/v1/sample/tpu/metadata.py +124 -0
  1135. vllm/v1/sample/tpu/sampler.py +145 -0
  1136. vllm/v1/serial_utils.py +315 -0
  1137. vllm/v1/spec_decode/__init__.py +0 -0
  1138. vllm/v1/spec_decode/eagle.py +432 -0
  1139. vllm/v1/spec_decode/medusa.py +62 -0
  1140. vllm/v1/spec_decode/metadata.py +62 -0
  1141. vllm/v1/spec_decode/metrics.py +178 -0
  1142. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1143. vllm/v1/spec_decode/utils.py +46 -0
  1144. vllm/v1/structured_output/__init__.py +222 -0
  1145. vllm/v1/structured_output/backend_guidance.py +245 -0
  1146. vllm/v1/structured_output/backend_types.py +134 -0
  1147. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1148. vllm/v1/structured_output/request.py +86 -0
  1149. vllm/v1/structured_output/utils.py +175 -0
  1150. vllm/v1/utils.py +743 -0
  1151. vllm/v1/worker/__init__.py +0 -0
  1152. vllm/v1/worker/block_table.py +142 -0
  1153. vllm/v1/worker/cpu_model_runner.py +86 -0
  1154. vllm/v1/worker/cpu_worker.py +152 -0
  1155. vllm/v1/worker/gpu_input_batch.py +681 -0
  1156. vllm/v1/worker/gpu_model_runner.py +2320 -0
  1157. vllm/v1/worker/gpu_worker.py +393 -0
  1158. vllm/v1/worker/lora_model_runner_mixin.py +173 -0
  1159. vllm/v1/worker/tpu_model_runner.py +1673 -0
  1160. vllm/v1/worker/tpu_worker.py +299 -0
  1161. vllm/v1/worker/utils.py +111 -0
  1162. vllm/v1/worker/worker_base.py +65 -0
  1163. vllm/version.py +41 -0
  1164. vllm/vllm_flash_attn/.gitkeep +0 -0
  1165. vllm/worker/__init__.py +0 -0
  1166. vllm/worker/cache_engine.py +145 -0
  1167. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1168. vllm/worker/cpu_model_runner.py +671 -0
  1169. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1170. vllm/worker/cpu_worker.py +450 -0
  1171. vllm/worker/enc_dec_model_runner.py +555 -0
  1172. vllm/worker/hpu_model_runner.py +2320 -0
  1173. vllm/worker/hpu_worker.py +484 -0
  1174. vllm/worker/model_runner.py +2178 -0
  1175. vllm/worker/model_runner_base.py +282 -0
  1176. vllm/worker/multi_step_hpu_worker.py +123 -0
  1177. vllm/worker/multi_step_model_runner.py +911 -0
  1178. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1179. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1180. vllm/worker/multi_step_tpu_worker.py +108 -0
  1181. vllm/worker/multi_step_worker.py +197 -0
  1182. vllm/worker/neuron_model_runner.py +460 -0
  1183. vllm/worker/neuron_worker.py +193 -0
  1184. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1185. vllm/worker/pooling_model_runner.py +211 -0
  1186. vllm/worker/tpu_model_runner.py +909 -0
  1187. vllm/worker/tpu_worker.py +337 -0
  1188. vllm/worker/utils.py +53 -0
  1189. vllm/worker/worker.py +577 -0
  1190. vllm/worker/worker_base.py +646 -0
  1191. vllm/worker/xpu_model_runner.py +606 -0
  1192. vllm/worker/xpu_worker.py +186 -0
  1193. vllm_cpu_amxbf16-0.9.1.dist-info/METADATA +305 -0
  1194. vllm_cpu_amxbf16-0.9.1.dist-info/RECORD +1197 -0
  1195. vllm_cpu_amxbf16-0.9.1.dist-info/WHEEL +5 -0
  1196. vllm_cpu_amxbf16-0.9.1.dist-info/entry_points.txt +5 -0
  1197. vllm_cpu_amxbf16-0.9.1.dist-info/top_level.txt +1 -0
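Before the per-file diff below, here is a minimal, illustrative sketch of the SchedulingBudget helper introduced in vllm/core/scheduler.py (the API is taken from the added code that follows; the request id and token counts are made-up values, not part of the package):

from vllm.core.scheduler import SchedulingBudget

# Budget for one scheduling iteration: at most 2048 batched tokens and 8 sequences.
budget = SchedulingBudget(token_budget=2048, max_num_seqs=8)

# Ask whether a hypothetical request with a 512-token prompt still fits.
if budget.can_schedule(num_new_tokens=512, num_new_seqs=1):
    # Updates are keyed by request id, so repeated calls for the same
    # request are ignored rather than double-counted.
    budget.add_num_batched_tokens("req-0", num_batched_tokens=512)
    budget.add_num_seqs("req-0", num_curr_seqs=1)

print(budget.remaining_token_budget())  # 2048 - 512 = 1536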
vllm/core/scheduler.py ADDED
@@ -0,0 +1,2093 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ import enum
5
+ import os
6
+ import random
7
+ import time
8
+ from collections import deque
9
+ from dataclasses import dataclass, field
10
+ from typing import Callable, Deque, Dict, Iterable, List, Optional
11
+ from typing import Sequence as GenericSequence
12
+ from typing import Set, Tuple, Union
13
+
14
+ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
15
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
16
+ from vllm.logger import init_logger
17
+ from vllm.lora.request import LoRARequest
18
+ from vllm.prompt_adapter.request import PromptAdapterRequest
19
+ from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
20
+ SequenceGroupBase, SequenceGroupMetadata,
21
+ SequenceGroupMetadataDelta, SequenceStage,
22
+ SequenceStatus)
23
+ from vllm.utils import Device, PyObjectCache
24
+
25
+ logger = init_logger(__name__)
26
+
27
+ # Test-only. If configured, decode is preempted with
28
+ # ARTIFICIAL_PREEMPTION_PROB% probability.
29
+ ENABLE_ARTIFICIAL_PREEMPT = bool(
30
+ os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa
31
+ ARTIFICIAL_PREEMPTION_PROB = 0.5
32
+ ARTIFICIAL_PREEMPTION_MAX_CNT = 500
33
+
34
+
35
+ class PreemptionMode(enum.Enum):
36
+ """Preemption modes.
37
+
38
+ 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
39
+ and swap them back in when the sequences are resumed.
40
+ 2. Recomputation: Discard the blocks of the preempted sequences and
41
+ recompute them when the sequences are resumed, treating the sequences as
42
+ new prompts.
43
+ """
44
+
45
+ SWAP = enum.auto()
46
+ RECOMPUTE = enum.auto()
47
+
48
+
49
+ @dataclass
50
+ class SchedulingBudget:
51
+ """The available slots for scheduling.
52
+
53
+ TODO(sang): Right now, the budget is request_id-aware meaning it can ignore
54
+ budget update from the same request_id. It is because in normal scheduling
55
+ path, we update RUNNING num_seqs ahead of time, meaning it could be
56
+ updated more than once when scheduling RUNNING requests. Since this won't
57
+ happen if we only have chunked prefill scheduling, we can remove this
58
+ feature from the API when chunked prefill is enabled by default.
59
+ """
60
+
61
+ token_budget: int
62
+ max_num_seqs: int
63
+ _request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
64
+ _request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
65
+ # Number of cached tokens in the batch.
66
+ _num_cached_tokens: int = 0
67
+ # Number of actual non-cached tokens in the batch.
68
+ _num_batched_tokens: int = 0
69
+ _num_curr_seqs: int = 0
70
+
71
+ def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
72
+ # We allow num_new_tokens to be 0 when the entire sequence has
73
+ # been cached.
74
+ assert num_new_tokens >= 0
75
+ assert num_new_seqs != 0
76
+ return (self.num_batched_tokens + num_new_tokens <= self.token_budget
77
+ and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)
78
+
79
+ def remaining_token_budget(self):
80
+ return self.token_budget - self.num_batched_tokens
81
+
82
+ def add_num_batched_tokens(self,
83
+ req_id: str,
84
+ num_batched_tokens: int,
85
+ num_cached_tokens: int = 0):
86
+ if req_id in self._request_ids_num_batched_tokens:
87
+ return
88
+ assert num_cached_tokens >= 0
89
+ assert num_batched_tokens >= 0
90
+
91
+ self._request_ids_num_batched_tokens.add(req_id)
92
+ self._num_batched_tokens += num_batched_tokens
93
+ self._num_cached_tokens += num_cached_tokens
94
+
95
+ def subtract_num_batched_tokens(self, req_id: str,
96
+ num_batched_tokens: int):
97
+ if req_id in self._request_ids_num_batched_tokens:
98
+ self._request_ids_num_batched_tokens.remove(req_id)
99
+ self._num_batched_tokens -= num_batched_tokens
100
+
101
+ def add_num_seqs(self, req_id: str, num_curr_seqs: int):
102
+ if req_id in self._request_ids_num_curr_seqs:
103
+ return
104
+
105
+ self._request_ids_num_curr_seqs.add(req_id)
106
+ self._num_curr_seqs += num_curr_seqs
107
+
108
+ def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
109
+ if req_id in self._request_ids_num_curr_seqs:
110
+ self._request_ids_num_curr_seqs.remove(req_id)
111
+ self._num_curr_seqs -= num_curr_seqs
112
+
113
+ @property
114
+ def num_batched_tokens(self):
115
+ return self._num_batched_tokens
116
+
117
+ @property
118
+ def num_curr_seqs(self):
119
+ return self._num_curr_seqs
120
+
121
+ @property
122
+ def num_cached_tokens(self):
123
+ return self._num_cached_tokens
124
+
125
+
126
+ @dataclass
127
+ class ScheduledSequenceGroup:
128
+ # A sequence group that's scheduled.
129
+ seq_group: SequenceGroup
130
+ # The total chunk size (number of tokens) to process for next iteration.
131
+ # 1 for decoding. Same as prompt tokens for prefill, but if prefill is
132
+ # chunked, it can be smaller than that.
133
+ token_chunk_size: int
134
+
135
+
136
+ @dataclass
137
+ class SchedulerOutputs:
138
+ """The scheduling decision made from a scheduler."""
139
+
140
+ # Scheduled sequence groups.
141
+ scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup]
142
+ # Number of prefill groups scheduled.
143
+ num_prefill_groups: int
144
+ # Total number of batched tokens.
145
+ num_batched_tokens: int
146
+ # Blocks to swap in. List of CPU -> GPU block number.
147
+ blocks_to_swap_in: List[Tuple[int, int]]
148
+ # Blocks to swap out. List of GPU -> CPU block number.
149
+ blocks_to_swap_out: List[Tuple[int, int]]
150
+ # Blocks to copy. Source to dest block.
151
+ blocks_to_copy: List[Tuple[int, int]]
152
+ # Sequence groups that are going to be ignored.
153
+ ignored_seq_groups: List[SequenceGroup]
154
+ # The number of slots for lookahead decoding.
155
+ num_lookahead_slots: int
156
+ # The number of requests in the running queue
157
+ running_queue_size: int
158
+ preempted: int
159
+
160
+ def __post_init__(self):
161
+ # Swap in and swap out should never happen at the same time.
162
+ assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)
163
+
164
+ self.num_loras: int = len(self.lora_requests)
165
+ if self.num_loras > 0:
166
+ self._sort_by_lora_ids()
167
+
168
+ self.num_prompt_adapters: int = len(self.prompt_adapter_requests)
169
+
170
+ def is_empty(self) -> bool:
171
+ # NOTE: We do not consider the ignored sequence groups.
172
+ return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
173
+ and not self.blocks_to_swap_out and not self.blocks_to_copy)
174
+
175
+ def _sort_by_lora_ids(self):
176
+ assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups)
177
+
178
+ def key_fn(group: ScheduledSequenceGroup):
179
+ key = (group.seq_group.lora_int_id, group.seq_group.request_id)
180
+ if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups):
181
+ # Sort sequence groups so that all prefills come before all
182
+ # decodes as required by chunked prefill.
183
+ return (not group.seq_group.is_prefill(), *key)
184
+ return key
185
+
186
+ self.scheduled_seq_groups = sorted(self.scheduled_seq_groups,
187
+ key=key_fn)
188
+
189
+ @property
190
+ def lora_requests(self) -> Set[LoRARequest]:
191
+ return {
192
+ g.seq_group.lora_request
193
+ for g in self.scheduled_seq_groups
194
+ if g.seq_group.lora_request is not None
195
+ }
196
+
197
+ @property
198
+ def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
199
+ return {
200
+ g.seq_group.prompt_adapter_request
201
+ for g in self.scheduled_seq_groups
202
+ if g.seq_group.prompt_adapter_request is not None
203
+ }
204
+
205
+
206
+ @dataclass
207
+ class SchedulerRunningOutputs:
208
+ """The requests that are scheduled from a running queue.
209
+
210
+ Could contain prefill (prefill that's chunked) or decodes. If there's not
211
+ enough memory, it can be preempted (for recompute) or swapped out.
212
+ """
213
+
214
+ # Selected sequences that are running and in a decoding phase.
215
+ decode_seq_groups: List[ScheduledSequenceGroup]
216
+ # Selected sequences that are running and in a prefill phase.
217
+ # I.e., it means the prefill has been chunked.
218
+ prefill_seq_groups: List[ScheduledSequenceGroup]
219
+ # The preempted sequences.
220
+ preempted: List[SequenceGroup]
221
+ # Sequences that are swapped out.
222
+ swapped_out: List[SequenceGroup]
223
+ # The blocks to swap out.
224
+ blocks_to_swap_out: List[Tuple[int, int]]
225
+ # The blocks to copy.
226
+ blocks_to_copy: List[Tuple[int, int]]
227
+ # The number of slots for lookahead decoding.
228
+ num_lookahead_slots: int
229
+
230
+ # Optimization for fast-access to seq_group lists
231
+ decode_seq_groups_list: List[SequenceGroup]
232
+ prefill_seq_groups_list: List[SequenceGroup]
233
+
234
+ @classmethod
235
+ def create_empty(cls) -> "SchedulerRunningOutputs":
236
+ return SchedulerRunningOutputs(
237
+ decode_seq_groups=[],
238
+ prefill_seq_groups=[],
239
+ preempted=[],
240
+ swapped_out=[],
241
+ blocks_to_swap_out=[],
242
+ blocks_to_copy=[],
243
+ num_lookahead_slots=0,
244
+ decode_seq_groups_list=[],
245
+ prefill_seq_groups_list=[],
246
+ )
247
+
248
+
249
+ @dataclass
250
+ class SchedulerSwappedInOutputs:
251
+ """The requests that are scheduled from a swap queue.
252
+
253
+ Could contain prefill (prefill that's chunked) or decodes.
254
+ """
255
+
256
+ # Selected sequences that are going to be swapped in and is in a
257
+ # decoding phase.
258
+ decode_seq_groups: List[ScheduledSequenceGroup]
259
+ # Selected sequences that are going to be swapped in and in a prefill
260
+ # phase. I.e., it means the prefill has been chunked.
261
+ prefill_seq_groups: List[ScheduledSequenceGroup]
262
+ # The blocks to swap in.
263
+ blocks_to_swap_in: List[Tuple[int, int]]
264
+ # The blocks to copy.
265
+ blocks_to_copy: List[Tuple[int, int]]
266
+ # The number of slots for lookahead decoding.
267
+ num_lookahead_slots: int
268
+ # Infeasible sequence groups.
269
+ infeasible_seq_groups: List[SequenceGroup]
270
+
271
+ @classmethod
272
+ def create_empty(cls) -> "SchedulerSwappedInOutputs":
273
+ return SchedulerSwappedInOutputs(
274
+ decode_seq_groups=[],
275
+ prefill_seq_groups=[],
276
+ blocks_to_swap_in=[],
277
+ blocks_to_copy=[],
278
+ num_lookahead_slots=0,
279
+ infeasible_seq_groups=[],
280
+ )
281
+
282
+
283
+ @dataclass
284
+ class SchedulerPrefillOutputs:
285
+ """The requests that are scheduled from a waiting queue.
286
+
287
+ Could contain a fresh prefill requests or preempted requests that need
288
+ to be recomputed from scratch.
289
+ """
290
+
291
+ # Selected sequences for prefill.
292
+ seq_groups: List[ScheduledSequenceGroup]
293
+ # Ignored sequence groups.
294
+ ignored_seq_groups: List[SequenceGroup]
295
+ num_lookahead_slots: int
296
+
297
+ @classmethod
298
+ def create_empty(cls) -> "SchedulerPrefillOutputs":
299
+ return SchedulerPrefillOutputs(
300
+ seq_groups=[],
301
+ ignored_seq_groups=[],
302
+ num_lookahead_slots=0,
303
+ )
304
+
305
+
306
+ def seq_group_metadata_builder():
307
+ return SequenceGroupMetadata(request_id="",
308
+ is_prompt=False,
309
+ seq_data={},
310
+ sampling_params=None,
311
+ block_tables={})
312
+
313
+
314
+ def scheduler_running_outputs_builder():
315
+ return SchedulerRunningOutputs(decode_seq_groups=[],
316
+ prefill_seq_groups=[],
317
+ preempted=[],
318
+ swapped_out=[],
319
+ blocks_to_swap_out=[],
320
+ blocks_to_copy=[],
321
+ num_lookahead_slots=0,
322
+ prefill_seq_groups_list=[],
323
+ decode_seq_groups_list=[])
324
+
325
+
326
+ def scheduled_seq_group_builder():
327
+ return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
328
+ token_chunk_size=0)
329
+ # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
330
+
331
+
332
+ @dataclass
333
+ class PartialPrefillMetadata:
334
+ """Holds information about the partial prefills that are currently running
335
+ during a single iteration of the Scheduler.
336
+ When chunked prefill is enabled, we allow a certain number of seqs to be
337
+ partially prefilled during each iteration. Having multiple partial prefills
338
+ in flight allows us to minimize TTFT and avoid decode starvation in cases
339
+ where a single sequence group with a very large prompt blocks the queue for
340
+ too many iterations.
341
+ The number of long prefill requests is limited so that smaller
342
+ requests may jump the queue in front of them and get to the decode
343
+ phase faster.
344
+ """
345
+
346
+ # A minimum bound on the total number of prefills to be scheduled during
347
+ # this iteration
348
+ schedulable_prefills: int
349
+
350
+ # The number of long prefill requests currently running
351
+ long_prefills: int
352
+
353
+ scheduler_config: SchedulerConfig
354
+
355
+ def can_schedule(self, seq_group: SequenceGroup) -> bool:
356
+ """When concurrent partial prefills are enabled,
357
+ we limit the number of long requests and only accept
358
+ shorter requests from the queue while running them
359
+ concurrently"""
360
+ return not (seq_group.first_seq.get_num_new_tokens()
361
+ > self.scheduler_config.long_prefill_token_threshold
362
+ and self.long_prefills
363
+ >= self.scheduler_config.max_long_partial_prefills
364
+ and self.scheduler_config.max_num_partial_prefills > 1)
365
+
366
+ def maybe_increment_partial_prefills(self,
367
+ seq_group: SequenceGroup) -> None:
368
+ # When a new prefill is scheduled, we need to know if it is a
369
+ # long request
370
+ if (seq_group.first_seq.get_num_new_tokens()
371
+ > self.scheduler_config.long_prefill_token_threshold):
372
+ self.long_prefills += 1
373
+
374
+ @classmethod
375
+ def from_queues(
376
+ cls,
377
+ running: Deque[SequenceGroup],
378
+ waiting: Deque[SequenceGroup],
379
+ scheduler_config: SchedulerConfig,
380
+ ) -> "PartialPrefillMetadata":
381
+ """Create a PartialPrefillMetadata object from the current state of
382
+ the scheduler's queues.
383
+ This accounts for the currently running prefill requests, and peeks into
384
+ the waiting queue to see if there are more prefills to potentially be
385
+ scheduled during this iteration."""
386
+ prefills = 0
387
+ long_prefills = 0
388
+
389
+ waiting_long_prefills = 0
390
+
391
+ for sg in running:
392
+ if sg.first_seq.data.stage == SequenceStage.PREFILL:
393
+ prefills += 1
394
+ if (sg.first_seq.get_num_new_tokens()
395
+ > scheduler_config.long_prefill_token_threshold):
396
+ long_prefills += 1
397
+
398
+ for sg in waiting:
399
+ # Don't bother looping through the rest of the queue if we know
400
+ # there are already at
401
+ # least max_partial_prefills requests to fill
402
+ if prefills >= scheduler_config.max_num_partial_prefills:
403
+ break
404
+
405
+ # Don't count long requests from the waiting queue if we aren't
406
+ # going to schedule them anyway
407
+ if (sg.first_seq.get_num_new_tokens()
408
+ > scheduler_config.long_prefill_token_threshold):
409
+ if (long_prefills + waiting_long_prefills
410
+ >= scheduler_config.max_long_partial_prefills):
411
+ continue
412
+ waiting_long_prefills += 1
413
+ prefills += 1
414
+
415
+ # NB: long_prefills and waiting_long_prefills are tracked separately.
416
+ # We don't account for the waiting requests here because we need to use
417
+ # this metadata to track how many have actually been scheduled.
418
+ return PartialPrefillMetadata(
419
+ schedulable_prefills=min(
420
+ prefills, scheduler_config.max_num_partial_prefills),
421
+ long_prefills=long_prefills,
422
+ scheduler_config=scheduler_config,
423
+ )
424
+
425
+
426
+ class Scheduler:
427
+
428
+ def __init__(
429
+ self,
430
+ scheduler_config: SchedulerConfig,
431
+ cache_config: CacheConfig,
432
+ lora_config: Optional[LoRAConfig],
433
+ pipeline_parallel_size: int = 1,
434
+ output_proc_callback: Optional[Callable] = None,
435
+ ) -> None:
436
+ self.scheduler_config = scheduler_config
437
+ self.cache_config = cache_config
438
+ # Note for LoRA scheduling: the current policy is extremely
439
+ # simple and NOT fair. It can lead to starvation of some
440
+ # LoRAs. This should be improved in the future.
441
+ self.lora_config = lora_config
442
+
443
+ version = "selfattn"
444
+ if (self.scheduler_config.runner_type == "pooling"
445
+ or self.cache_config.is_attention_free):
446
+ version = "placeholder"
447
+
448
+ BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
449
+ version)
450
+
451
+ num_gpu_blocks = cache_config.num_gpu_blocks
452
+ if num_gpu_blocks:
453
+ num_gpu_blocks //= pipeline_parallel_size
454
+
455
+ num_cpu_blocks = cache_config.num_cpu_blocks
456
+ if num_cpu_blocks:
457
+ num_cpu_blocks //= pipeline_parallel_size
458
+
459
+ # Create the block space manager.
460
+ self.block_manager = BlockSpaceManagerImpl(
461
+ block_size=self.cache_config.block_size,
462
+ num_gpu_blocks=num_gpu_blocks,
463
+ num_cpu_blocks=num_cpu_blocks,
464
+ sliding_window=self.cache_config.sliding_window,
465
+ enable_caching=self.cache_config.enable_prefix_caching,
466
+ )
467
+
468
+ # Sequence groups in the WAITING state.
469
+ # Contain new prefill or preempted requests.
470
+ self.waiting: Deque[SequenceGroup] = deque()
471
+ # Sequence groups in the RUNNING state.
472
+ # Contain decode requests.
473
+ self.running: Deque[SequenceGroup] = deque()
474
+ # Sequence groups in the SWAPPED state.
475
+ # Contain decode requests that are swapped out.
476
+ self.swapped: Deque[SequenceGroup] = deque()
477
+ # Sequence groups finished requests ids since last step iteration.
478
+ # It lets the model know that any state associated with these requests
479
+ # can and must be released after the current step.
480
+ # This is used to evict the finished requests from the Mamba cache.
481
+ self._finished_requests_ids: List[str] = list()
482
+ # Time at previous scheduling step
483
+ self.prev_time = 0.0
484
+ # Did we schedule a prompt at previous step?
485
+ self.prev_prompt = False
486
+ # Latency of the last prompt step
487
+ self.last_prompt_latency = 0.0
488
+ # preemption mode, RECOMPUTE or SWAP
489
+ self.user_specified_preemption_mode = scheduler_config.preemption_mode
490
+
491
+ # The following field is test-only. It is used to inject artificial
492
+ # preemption.
493
+ self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
494
+ self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
495
+ if self.enable_artificial_preemption
496
+ else 0)
497
+ self.num_cumulative_preemption: int = 0
498
+
499
+ # Used to cache python objects
500
+ self._seq_group_metadata_cache: List[PyObjectCache] = []
501
+ self._scheduler_running_outputs_cache: List[PyObjectCache] = []
502
+ self._scheduled_seq_group_cache: List[PyObjectCache] = []
503
+
504
+ # For async output processing, we need to swap cache buffers between
505
+ # iterations. I.e. since the output processing is lagged one step,
506
+ # we cannot reuse the cached objects immediately when schedule()
507
+ # is called again, but only when schedule() is called the second time.
508
+ self.output_proc_callback = output_proc_callback
509
+ self.use_async_output_proc = self.output_proc_callback is not None
510
+ self.num_cache_iters = 2 if self.use_async_output_proc else 1
511
+
512
+ self.cache_id = 0
513
+ for i in range(self.num_cache_iters):
514
+ self._seq_group_metadata_cache.append(
515
+ PyObjectCache(seq_group_metadata_builder))
516
+ self._scheduler_running_outputs_cache.append(
517
+ PyObjectCache(scheduler_running_outputs_builder))
518
+ self._scheduled_seq_group_cache.append(
519
+ PyObjectCache(scheduled_seq_group_builder))
520
+
521
+ # For async postprocessor, the extra decode run cannot be done
522
+ # when the request reaches max_model_len. In this case, the request
523
+ # will be stopped during the schedule() call and added to this stop list
524
+ # for processing and deallocation by the free_finished_seq_groups()
525
+ self._async_stopped: List[SequenceGroup] = []
526
+
527
+ # List with the chunk sizes to hand out to each sequence depending
528
+ # on how many partial prefills are running. This is slightly faster than
529
+ # running an integer division every time a prefill is scheduled.
530
+ # This splits the budget evenly among all prefills.
531
+ self.partial_prefill_budget_lookup_list = [0] * (
532
+ self.scheduler_config.max_num_partial_prefills + 1)
533
+ self.partial_prefill_budget_lookup_list[0] = (
534
+ scheduler_config.max_num_batched_tokens)
535
+ for i in range(1, self.scheduler_config.max_num_partial_prefills + 1):
536
+ self.partial_prefill_budget_lookup_list[i] = (
537
+ scheduler_config.max_num_batched_tokens // i)
538
+
539
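To make the lookup table above concrete, a small worked example with hypothetical numbers (not defaults from this file): with max_num_batched_tokens = 2048 and max_num_partial_prefills = 4, index i gives the per-prefill token chunk when i partial prefills run concurrently, and index 0 keeps the full budget.

    max_num_batched_tokens = 2048      # hypothetical values
    max_num_partial_prefills = 4
    lookup = [0] * (max_num_partial_prefills + 1)
    lookup[0] = max_num_batched_tokens
    for i in range(1, max_num_partial_prefills + 1):
        lookup[i] = max_num_batched_tokens // i
    # Each of i concurrent prefills gets an even share of the token budget.
    assert lookup == [2048, 2048, 1024, 682, 512]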
+ @property
540
+ def next_cache_id(self):
541
+ return (self.cache_id + 1) % self.num_cache_iters
542
+
543
+ @property
544
+ def lora_enabled(self) -> bool:
545
+ return bool(self.lora_config)
546
+
547
+ @property
548
+ def num_decoding_tokens_per_seq(self) -> int:
549
+ """The number of new tokens."""
550
+ return 1
551
+
552
+ def add_seq_group(self, seq_group: SequenceGroup) -> None:
553
+ # Add sequence groups to the waiting queue.
554
+ self.waiting.append(seq_group)
555
+
556
+ def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None:
557
+ # Add sequence groups to the running queue.
558
+ # Only for testing purposes.
559
+ self.running.append(seq_group)
560
+
561
+ def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None:
562
+ # Add sequence groups to the swapped queue.
563
+ # Only for testing purposes.
564
+ self.swapped.append(seq_group)
565
+
566
+ def abort_seq_group(
567
+ self,
568
+ request_id: Union[str, Iterable[str]],
569
+ seq_id_to_seq_group: Optional[Dict[str, SequenceGroupBase]] = None,
570
+ ) -> None:
571
+ """Aborts a sequence group with the given ID.
572
+
573
+ Check if the sequence group with the given ID
574
+ is present in any of the state queues.
575
+ If present, remove the sequence group from the state queue.
576
+ Also, if any of the sequences in the sequence group is not finished,
577
+ free the sequence with status `FINISHED_ABORTED`.
578
+ Otherwise, do nothing.
579
+
580
+ Args:
581
+ request_id: The ID(s) of the sequence group to abort.
582
+ seq_id_to_seq_group: helper for groups with n>1
583
+ """
584
+ if isinstance(request_id, str):
585
+ request_id = (request_id, )
586
+ request_ids = set(request_id)
587
+ seq_id_to_seq_group = seq_id_to_seq_group or {}
588
+ for state_queue in [self.waiting, self.running, self.swapped]:
589
+ aborted_groups: List[SequenceGroup] = []
590
+ for seq_group in state_queue:
591
+ # When n>1, seq_group.request_id looks like
592
+ # foo_parallel_sample_0, while request_ids is just foo, and we
593
+ # should resolve it as real_request_id to match.
594
+ if seq_group.request_id in seq_id_to_seq_group:
595
+ real_request_id = seq_id_to_seq_group[
596
+ seq_group.request_id].group_id
597
+ else:
598
+ real_request_id = seq_group.request_id
599
+ if real_request_id in request_ids:
600
+ # Appending aborted group into pending list.
601
+ aborted_groups.append(seq_group)
602
+ # We can't remove real_request_id in request_ids here,
603
+ # because there may be other seq groups sharing the same
604
+ # real_request_id
605
+ for aborted_group in aborted_groups:
606
+ # Remove the sequence group from the state queue.
607
+ state_queue.remove(aborted_group)
608
+ # Remove the aborted request from the Mamba cache.
609
+ self._finished_requests_ids.append(aborted_group.request_id)
610
+ for seq in aborted_group.get_seqs():
611
+ if seq.is_finished():
612
+ continue
613
+ seq.status = SequenceStatus.FINISHED_ABORTED
614
+ self.free_seq(seq)
615
+ if aborted_group.request_id in seq_id_to_seq_group:
616
+ del seq_id_to_seq_group[aborted_group.request_id]
617
+
618
+ self._free_seq_group_cross_attn_blocks(aborted_group)
619
+
620
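A hedged usage sketch for abort_seq_group: the scheduler object and request ids below are hypothetical, only the call shape follows the signature above. When n > 1 sampling is used, the per-sample groups share one user-facing request id, which is why the optional seq_id_to_seq_group mapping exists.

    # Hypothetical call site; `scheduler` and the ids are illustrative only.
    scheduler.abort_seq_group("request-123")                    # single abort
    scheduler.abort_seq_group(["request-123", "request-456"])   # batch abort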
+ def _free_seq_group_cross_attn_blocks(
621
+ self,
622
+ seq_group: SequenceGroup,
623
+ ) -> None:
624
+ """
625
+ Free a sequence group from a cross-attention block table.
626
+ Has no effect on decoder-only models.
627
+ """
628
+ if seq_group.is_encoder_decoder():
629
+ self.block_manager.free_cross(seq_group)
630
+
631
+ def has_unfinished_seqs(self) -> bool:
632
+ return (len(self.waiting) != 0 or len(self.running) != 0
633
+ or len(self.swapped) != 0)
634
+
635
+ def get_prefix_cache_hit_rate(self, device: Device) -> float:
636
+ return self.block_manager.get_prefix_cache_hit_rate(device)
637
+
638
+ def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
639
+ return self.block_manager.reset_prefix_cache(device)
640
+
641
+ def get_num_unfinished_seq_groups(self) -> int:
642
+ return len(self.waiting) + len(self.running) + len(self.swapped)
643
+
644
+ def get_and_reset_finished_requests_ids(self) -> List[str]:
645
+ """Flushes the list of request ids of previously finished seq_groups."""
646
+ finished_requests_ids = self._finished_requests_ids
647
+ self._finished_requests_ids = list()
648
+ return finished_requests_ids
649
+
650
+ def _schedule_running(
651
+ self,
652
+ budget: SchedulingBudget,
653
+ curr_loras: Optional[Set[int]],
654
+ enable_chunking: bool = False,
655
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
656
+ ) -> SchedulerRunningOutputs:
657
+ """Schedule sequence groups that are running.
658
+
659
+ Running queue should include decode and chunked prefill requests.
660
+
661
+ Args:
662
+ budget: The scheduling budget. The argument is in-place updated
663
+ when any decodes are preempted.
664
+ curr_loras: Currently batched lora request ids. The argument is
665
+ in-place updated when any decodes are preempted.
666
+ enable_chunking: If True, seq group can be chunked and only a
667
+ chunked number of tokens are scheduled if
668
+ `budget.num_batched_tokens` does not have enough capacity to schedule
669
+ all tokens.
670
+ partial_prefill_metadata: information about the partial prefills
671
+ that are currently running
672
+
673
+ Returns:
674
+ SchedulerRunningOutputs.
675
+ """
676
+ ret: SchedulerRunningOutputs = self._scheduler_running_outputs_cache[
677
+ self.cache_id].get_object()
678
+ ret.blocks_to_swap_out.clear()
679
+ ret.blocks_to_copy.clear()
680
+ ret.decode_seq_groups.clear()
681
+ ret.prefill_seq_groups.clear()
682
+ ret.preempted.clear()
683
+ ret.swapped_out.clear()
684
+
685
+ ret.num_lookahead_slots = self._get_num_lookahead_slots(
686
+ is_prefill=False, enable_chunking=enable_chunking)
687
+
688
+ ret.decode_seq_groups_list.clear()
689
+ ret.prefill_seq_groups_list.clear()
690
+
691
+ # Blocks that need to be swapped or copied before model execution.
692
+ blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
693
+ blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy
694
+
695
+ decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
696
+ prefill_seq_groups: List[
697
+ ScheduledSequenceGroup] = ret.prefill_seq_groups
698
+ preempted: List[SequenceGroup] = ret.preempted
699
+ swapped_out: List[SequenceGroup] = ret.swapped_out
700
+
701
+ running_queue = self.running
702
+ assert len(self._async_stopped) == 0
703
+ while running_queue:
704
+ seq_group = running_queue[0]
705
+ # We discard the cached tokens info here because we don't need it
706
+ # for a running sequence:
707
+ # 1. If a sequence is running with chunked prefill, the cached
708
+ # tokens info was already used for the first prefill.
709
+ # 2. If a sequence is running with non-chunked prefill, then
710
+ # it's a decoding sequence, and the cached tokens info is
711
+ # irrelevant.
712
+ num_uncached_new_tokens, _ = \
713
+ self._get_num_new_uncached_and_cached_tokens(
714
+ seq_group,
715
+ SequenceStatus.RUNNING,
716
+ enable_chunking,
717
+ budget,
718
+ partial_prefill_metadata,
719
+ )
720
+
721
+ num_running_tokens = num_uncached_new_tokens
722
+ if num_running_tokens == 0:
723
+ # No budget => Stop
724
+ break
725
+
726
+ running_queue.popleft()
727
+
728
+ # With async postprocessor, an extra decode run is done
729
+ # to process the final tokens. The check below avoids this extra
730
+ # decode run when the model max len is reached, in order to avoid
731
+ # a memory overflow.
732
+ if (self.use_async_output_proc and seq_group.seqs[0].get_len()
733
+ > self.scheduler_config.max_model_len):
734
+ self._async_stopped.append(seq_group)
735
+ continue
736
+
737
+ # NOTE(woosuk): Preemption happens only when there is no available
738
+ # slot to keep all the sequence groups in the RUNNING state.
739
+ while not self._can_append_slots(seq_group, enable_chunking):
740
+ budget.subtract_num_batched_tokens(seq_group.request_id,
741
+ num_running_tokens)
742
+ num_running_seqs = seq_group.get_max_num_running_seqs()
743
+ budget.subtract_num_seqs(seq_group.request_id,
744
+ num_running_seqs)
745
+
746
+ if (curr_loras is not None and seq_group.lora_int_id > 0
747
+ and seq_group.lora_int_id in curr_loras):
748
+ curr_loras.remove(seq_group.lora_int_id)
749
+
750
+ # Determine victim sequence
751
+ cont_loop = True
752
+ if running_queue:
753
+ # Preempt the lowest-priority sequence group.
754
+ victim_seq_group = running_queue.pop()
755
+ else:
756
+ # No other sequence group can be preempted.
757
+ # Preempt the current sequence group.
758
+ # Note: This is also where we stop this loop
759
+ # (since there is nothing else to preempt)
760
+ victim_seq_group = seq_group
761
+ cont_loop = False
762
+
763
+ # With async postprocessor, before preempting a sequence
764
+ # we need to ensure it has no pending async postprocessor
765
+ do_preempt = True
766
+ if self.use_async_output_proc:
767
+ assert self.output_proc_callback is not None
768
+ self.output_proc_callback(
769
+ request_id=victim_seq_group.request_id)
770
+
771
+ # It may be that the async pending "victim_seq_group"
772
+ # becomes finished, in which case we simply free it.
773
+ if victim_seq_group.is_finished():
774
+ self._free_finished_seq_group(victim_seq_group)
775
+ do_preempt = False
776
+
777
+ # Do preemption
778
+ if do_preempt:
779
+ preempted_mode = self._preempt(victim_seq_group,
780
+ blocks_to_swap_out)
781
+ if preempted_mode == PreemptionMode.RECOMPUTE:
782
+ preempted.append(victim_seq_group)
783
+ else:
784
+ swapped_out.append(victim_seq_group)
785
+
786
+ if not cont_loop:
787
+ break
788
+ else:
789
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
790
+ is_prefill = seq_group.is_prefill()
791
+
792
+ scheduled_seq_group: ScheduledSequenceGroup = (
793
+ self._scheduled_seq_group_cache[
794
+ self.cache_id].get_object())
795
+ scheduled_seq_group.seq_group = seq_group
796
+ if is_prefill:
797
+ scheduled_seq_group.token_chunk_size = num_running_tokens
798
+ prefill_seq_groups.append(scheduled_seq_group)
799
+ ret.prefill_seq_groups_list.append(seq_group)
800
+ else:
801
+ scheduled_seq_group.token_chunk_size = 1
802
+ decode_seq_groups.append(scheduled_seq_group)
803
+ ret.decode_seq_groups_list.append(seq_group)
804
+
805
+ budget.add_num_batched_tokens(seq_group.request_id,
806
+ num_running_tokens)
807
+ # OPTIMIZATION: Note that get_max_num_running_seqs is
808
+ # expensive. For the default scheduling case where
809
+ # enable_chunking is False, num_seqs are updated before running
810
+ # this method, so we don't have to update it again here.
811
+ if enable_chunking:
812
+ num_running_seqs = seq_group.get_max_num_running_seqs()
813
+ budget.add_num_seqs(seq_group.request_id, num_running_seqs)
814
+ if curr_loras is not None and seq_group.lora_int_id > 0:
815
+ curr_loras.add(seq_group.lora_int_id)
816
+
817
+ self._scheduler_running_outputs_cache[self.next_cache_id].reset()
818
+ self._scheduled_seq_group_cache[self.next_cache_id].reset()
819
+
820
+ return ret
821
+
822
+ def _schedule_swapped(
823
+ self,
824
+ budget: SchedulingBudget,
825
+ curr_loras: Optional[Set[int]],
826
+ enable_chunking: bool = False,
827
+ ) -> SchedulerSwappedInOutputs:
828
+ """Schedule sequence groups that are swapped out.
829
+
830
+ It schedules swapped requests as long as it fits `budget` and
831
+ curr_loras <= max_lora from the scheduling config. The input arguments
832
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
833
+
834
+ Args:
835
+ budget: The scheduling budget. The argument is in-place updated
836
+ when any requests are swapped in.
837
+ curr_loras: Currently batched lora request ids. The argument is
838
+ in-place updated when any requests are swapped in.
839
+ enable_chunking: If True, seq group can be chunked and only a
840
+ chunked number of tokens are scheduled if
841
+ `budget.num_batched_tokens` does not have enough capacity to schedule
842
+ all tokens.
843
+
844
+ Returns:
845
+ SchedulerSwappedInOutputs.
846
+ """
847
+ # Blocks that need to be swapped or copied before model execution.
848
+ blocks_to_swap_in: List[Tuple[int, int]] = []
849
+ blocks_to_copy: List[Tuple[int, int]] = []
850
+ decode_seq_groups: List[ScheduledSequenceGroup] = []
851
+ prefill_seq_groups: List[ScheduledSequenceGroup] = []
852
+ infeasible_seq_groups: List[SequenceGroup] = []
853
+
854
+ swapped_queue = self.swapped
855
+
856
+ leftover_swapped: Deque[SequenceGroup] = deque()
857
+ while swapped_queue:
858
+ seq_group = swapped_queue[0]
859
+
860
+ # If the sequence group cannot be swapped in, stop.
861
+ is_prefill = seq_group.is_prefill()
862
+ alloc_status = self.block_manager.can_swap_in(
863
+ seq_group,
864
+ self._get_num_lookahead_slots(is_prefill, enable_chunking))
865
+ if alloc_status == AllocStatus.LATER:
866
+ break
867
+ elif alloc_status == AllocStatus.NEVER:
868
+ logger.warning(
869
+ "Failing the request %s because there's not enough kv "
870
+ "cache blocks to run the entire sequence.",
871
+ seq_group.request_id,
872
+ )
873
+ for seq in seq_group.get_seqs():
874
+ seq.status = SequenceStatus.FINISHED_IGNORED
875
+ infeasible_seq_groups.append(seq_group)
876
+ swapped_queue.popleft()
877
+ continue
878
+
879
+ lora_int_id = 0
880
+ if self.lora_enabled:
881
+ lora_int_id = seq_group.lora_int_id
882
+ assert curr_loras is not None
883
+ assert self.lora_config is not None
884
+ if (lora_int_id > 0 and (lora_int_id not in curr_loras)
885
+ and len(curr_loras) >= self.lora_config.max_loras):
886
+ # We don't have a space for another LoRA, so
887
+ # we ignore this request for now.
888
+ leftover_swapped.appendleft(seq_group)
889
+ swapped_queue.popleft()
890
+ continue
891
+
892
+ # The total number of sequences in the RUNNING state should not
893
+ # exceed the maximum number of sequences.
894
+ num_new_seqs = seq_group.get_max_num_running_seqs()
895
+ num_new_tokens_uncached, num_new_tokens_cached = (
896
+ self._get_num_new_uncached_and_cached_tokens(
897
+ seq_group, SequenceStatus.SWAPPED, enable_chunking,
898
+ budget))
899
+
900
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
901
+ num_new_tokens=num_new_tokens_uncached,
902
+ num_new_seqs=num_new_seqs,
903
+ ):
904
+ break
905
+
906
+ if lora_int_id > 0 and curr_loras is not None:
907
+ curr_loras.add(lora_int_id)
908
+ swapped_queue.popleft()
909
+ self._swap_in(seq_group, blocks_to_swap_in)
910
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
911
+ if is_prefill:
912
+ prefill_seq_groups.append(
913
+ ScheduledSequenceGroup(
914
+ seq_group,
915
+ token_chunk_size=num_new_tokens_uncached +
916
+ num_new_tokens_cached,
917
+ ))
918
+ else:
919
+ decode_seq_groups.append(
920
+ ScheduledSequenceGroup(seq_group, token_chunk_size=1))
921
+ budget.add_num_batched_tokens(
922
+ seq_group.request_id,
923
+ num_batched_tokens=num_new_tokens_uncached,
924
+ num_cached_tokens=num_new_tokens_cached,
925
+ )
926
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
927
+
928
+ swapped_queue.extendleft(leftover_swapped)
929
+
930
+ return SchedulerSwappedInOutputs(
931
+ decode_seq_groups=decode_seq_groups,
932
+ prefill_seq_groups=prefill_seq_groups,
933
+ blocks_to_swap_in=blocks_to_swap_in,
934
+ blocks_to_copy=blocks_to_copy,
935
+ num_lookahead_slots=self._get_num_lookahead_slots(
936
+ is_prefill=False, enable_chunking=enable_chunking),
937
+ infeasible_seq_groups=infeasible_seq_groups,
938
+ )
939
+
940
+ def _get_prompt_limit(self, seq_group: SequenceGroup) -> int:
941
+ if (self.scheduler_config.chunked_prefill_enabled
942
+ and not self.scheduler_config.is_multi_step):
943
+ prompt_limit = self.scheduler_config.max_model_len
944
+ else:
945
+ prompt_limit = min(
946
+ self.scheduler_config.max_model_len,
947
+ self.scheduler_config.max_num_batched_tokens,
948
+ )
949
+
950
+ # Model is fine tuned with long context. Return the fine tuned max_len.
951
+ if seq_group.lora_request and seq_group.lora_request.long_lora_max_len:
952
+ assert prompt_limit <= seq_group.lora_request.long_lora_max_len
953
+ return seq_group.lora_request.long_lora_max_len
954
+ else:
955
+ return prompt_limit
956
+
957
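A quick worked example of the branches in _get_prompt_limit under assumed config values (not defaults from this file): with chunked prefill enabled and not multi-step, the limit is max_model_len; otherwise it is the smaller of max_model_len and max_num_batched_tokens, and a long-LoRA request can raise it to long_lora_max_len.

    # Hypothetical numbers, mirroring the branches above.
    max_model_len = 8192
    max_num_batched_tokens = 2048

    limit_chunked = max_model_len                          # chunked prefill
    limit_default = min(max_model_len, max_num_batched_tokens)
    assert (limit_chunked, limit_default) == (8192, 2048)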
+ def _get_priority(self,
958
+ seq_group: SequenceGroup) -> Tuple[Optional[int], float]:
959
+ """Get the priority of the sequence group.
960
+ Highest preference to user-defined priority, followed by arrival time.
961
+ Args:
962
+ seq_group: The sequence group input.
963
+ Returns:
964
+ The priority of the sequence group.
965
+ """
966
+ return seq_group.priority, seq_group.arrival_time
967
+
968
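The priority key above is a (priority, arrival_time) tuple, so a plain ascending sort puts the most urgent requests first: a lower user priority value wins, and ties are broken by earlier arrival. A minimal sketch with hypothetical values:

    # (priority, arrival_time) tuples; lower sorts first.
    keys = [(1, 100.0), (0, 105.0), (1, 90.0)]
    assert sorted(keys) == [(0, 105.0), (1, 90.0), (1, 100.0)]
    # In _schedule_priority_preemption, running_queue[-1] is therefore the
    # lowest-priority running group, i.e. the preemption victim candidate.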
+ def _schedule_priority_preemption(
969
+ self,
970
+ budget: SchedulingBudget,
971
+ ) -> int:
972
+ """Sorts waiting and running queue. Also, force preempt requests
973
+ from the running queue if their priority is lower.
974
+ Priority-based preemption is used with the priority policy.
975
+ Args:
976
+ budget: The scheduling budget. The argument is in-place updated
977
+ when any requests are scheduled.
978
+ Returns:
979
+ A count of priority-based preemptions.
980
+ """
981
+
982
+ waiting_queue = self.waiting
983
+
984
+ running_queue = deque(sorted(self.running, key=self._get_priority))
985
+
986
+ blocks_to_swap_out: List[Tuple[int, int]] = []
987
+ force_preemption_count = 0
988
+
989
+ if waiting_queue:
990
+ seq_group = waiting_queue.popleft()
991
+ num_new_seqs = seq_group.get_max_num_running_seqs()
992
+ num_new_tokens_uncached, _ = \
993
+ self._get_num_new_uncached_and_cached_tokens(
994
+ seq_group, SequenceStatus.WAITING, False, budget)
995
+
996
+ # Only preempt if priority inversion exists
997
+ while running_queue and self._get_priority(
998
+ running_queue[-1]) > self._get_priority(seq_group):
999
+ # Only preempt if waiting sequence cannot be allocated
1000
+ can_allocate = self.block_manager.can_allocate(seq_group)
1001
+ if (num_new_tokens_uncached > 0
1002
+ and can_allocate == AllocStatus.OK
1003
+ and budget.can_schedule(
1004
+ num_new_tokens=num_new_tokens_uncached,
1005
+ num_new_seqs=num_new_seqs,
1006
+ )):
1007
+ break
1008
+
1009
+ # Adjust budget to remove the victim sequence group
1010
+ vseq_group = running_queue.pop()
1011
+ num_running_tokens_uncached, _ = (
1012
+ self._get_num_new_uncached_and_cached_tokens(
1013
+ vseq_group, SequenceStatus.RUNNING, False, budget))
1014
+ budget.subtract_num_batched_tokens(
1015
+ vseq_group.request_id, num_running_tokens_uncached)
1016
+ num_running_seqs = vseq_group.get_max_num_running_seqs()
1017
+ budget.subtract_num_seqs(vseq_group.request_id,
1018
+ num_running_seqs)
1019
+
1020
+ # Preempt out the victim sequence group
1021
+ self._preempt(vseq_group, blocks_to_swap_out)
1022
+ waiting_queue.appendleft(vseq_group)
1023
+ force_preemption_count += 1
1024
+ # Put the sequence back into the waiting queue
1025
+ waiting_queue.appendleft(seq_group)
1026
+
1027
+ waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))
1028
+
1029
+ self.waiting = waiting_queue
1030
+ self.running = running_queue
1031
+ return force_preemption_count
1032
+
1033
+ def _schedule_prefills(
1034
+ self,
1035
+ budget: SchedulingBudget,
1036
+ curr_loras: Optional[Set[int]],
1037
+ enable_chunking: bool = False,
1038
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
1039
+ ) -> SchedulerPrefillOutputs:
1040
+ """Schedule sequence groups that are in prefill stage.
1041
+
1042
+ Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
1043
+ as a new prefill (that starts from beginning -> most recently generated
1044
+ tokens).
1045
+
1046
+ It schedules waiting requests as long as it fits `budget` and
1047
+ curr_loras <= max_lora from the scheduling config. The input arguments
1048
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
1049
+
1050
+ Args:
1051
+ budget: The scheduling budget. The argument is in-place updated
1052
+ when any requests are scheduled.
1053
+ curr_loras: Currently batched lora request ids. The argument is
1054
+ in-place updated when any requests are scheduled.
1055
+ enable_chunking: If True, seq group can be chunked and only a
1056
+ chunked number of tokens are scheduled if
1057
+ `budget.num_batched_tokens` does not have enough capacity to schedule
1058
+ all tokens.
1059
+ partial_prefill_metadata: information about the partial prefills
1060
+ that are currently running
1061
+
1062
+ Returns:
1063
+ SchedulerPrefillOutputs.
1064
+ """
1065
+ if budget.remaining_token_budget() == 0:
1066
+ # Do nothing: Can't add any more prefill anyway
1067
+ return SchedulerPrefillOutputs(
1068
+ seq_groups=[],
1069
+ ignored_seq_groups=[],
1070
+ num_lookahead_slots=self._get_num_lookahead_slots(
1071
+ is_prefill=True, enable_chunking=enable_chunking),
1072
+ )
1073
+ ignored_seq_groups: List[SequenceGroup] = []
1074
+ seq_groups: List[ScheduledSequenceGroup] = []
1075
+ using_prompt_embeds: bool = False
1076
+
1077
+ waiting_queue = self.waiting
1078
+
1079
+ leftover_waiting_sequences: Deque[SequenceGroup] = deque()
1080
+ while self._passed_delay(time.time()) and waiting_queue:
1081
+ seq_group = waiting_queue[0]
1082
+
1083
+ waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
1084
+ assert len(waiting_seqs) == 1, (
1085
+ "Waiting sequence group should have only one prompt "
1086
+ "sequence.")
1087
+ if (partial_prefill_metadata is not None
1088
+ and not partial_prefill_metadata.can_schedule(seq_group)):
1089
+ leftover_waiting_sequences.appendleft(seq_group)
1090
+ waiting_queue.popleft()
1091
+ continue
1092
+ num_new_tokens_uncached, num_new_tokens_cached = (
1093
+ self._get_num_new_uncached_and_cached_tokens(
1094
+ seq_group,
1095
+ SequenceStatus.WAITING,
1096
+ enable_chunking,
1097
+ budget,
1098
+ partial_prefill_metadata=partial_prefill_metadata,
1099
+ ))
1100
+ num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached
1101
+
1102
+ if not enable_chunking:
1103
+ num_prompt_tokens = waiting_seqs[0].get_len()
1104
+ assert num_new_tokens == num_prompt_tokens
1105
+
1106
+ prompt_limit = self._get_prompt_limit(seq_group)
1107
+ if num_new_tokens > prompt_limit:
1108
+ logger.warning(
1109
+ "Input prompt (%d tokens) is too long"
1110
+ " and exceeds limit of %d",
1111
+ num_new_tokens,
1112
+ prompt_limit,
1113
+ )
1114
+ for seq in waiting_seqs:
1115
+ seq.status = SequenceStatus.FINISHED_IGNORED
1116
+ ignored_seq_groups.append(seq_group)
1117
+ waiting_queue.popleft()
1118
+ continue
1119
+
1120
+ num_lookahead_slots: int = 0
1121
+ if self.scheduler_config.is_multi_step and enable_chunking:
1122
+ num_lookahead_slots = self._get_num_lookahead_slots(
1123
+ True, enable_chunking)
1124
+
1125
+ # If the sequence group cannot be allocated, stop.
1126
+ can_allocate = self.block_manager.can_allocate(
1127
+ seq_group, num_lookahead_slots=num_lookahead_slots)
1128
+ if can_allocate == AllocStatus.LATER:
1129
+ break
1130
+ elif can_allocate == AllocStatus.NEVER:
1131
+ logger.warning(
1132
+ "Input prompt (%d tokens) + lookahead slots (%d) is "
1133
+ "too long and exceeds the capacity of block_manager",
1134
+ num_new_tokens,
1135
+ num_lookahead_slots,
1136
+ )
1137
+ for seq in waiting_seqs:
1138
+ seq.status = SequenceStatus.FINISHED_IGNORED
1139
+ ignored_seq_groups.append(seq_group)
1140
+ waiting_queue.popleft()
1141
+ continue
1142
+
1143
+ # We cannot mix sequence groups that use prompt embeds and
1144
+ # those that do not.
1145
+ if len(seq_groups) == 0:
1146
+ using_prompt_embeds = seq_group.uses_prompt_embeds()
1147
+ if using_prompt_embeds != seq_group.uses_prompt_embeds():
1148
+ leftover_waiting_sequences.appendleft(seq_group)
1149
+ waiting_queue.popleft()
1150
+ continue
1151
+
1152
+ lora_int_id = 0
1153
+ if self.lora_enabled:
1154
+ lora_int_id = seq_group.lora_int_id
1155
+ assert curr_loras is not None
1156
+ assert self.lora_config is not None
1157
+ if (self.lora_enabled and lora_int_id > 0
1158
+ and lora_int_id not in curr_loras
1159
+ and len(curr_loras) >= self.lora_config.max_loras):
1160
+ # We don't have a space for another LoRA, so
1161
+ # we ignore this request for now.
1162
+ leftover_waiting_sequences.appendleft(seq_group)
1163
+ waiting_queue.popleft()
1164
+ continue
1165
+
1166
+ if (budget.num_batched_tokens
1167
+ >= self.scheduler_config.max_num_batched_tokens):
1168
+ # We've reached the budget limit - since there might be
1169
+ # continuous prefills in the running queue, we should break
1170
+ # to avoid scheduling any new prefills.
1171
+ break
1172
+
1173
+ num_new_seqs = seq_group.get_max_num_running_seqs()
1174
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
1175
+ num_new_tokens=num_new_tokens_uncached,
1176
+ num_new_seqs=num_new_seqs,
1177
+ ):
1178
+ break
1179
+
1180
+ # Can schedule this request.
1181
+ if curr_loras is not None and lora_int_id > 0:
1182
+ curr_loras.add(lora_int_id)
1183
+ waiting_queue.popleft()
1184
+ self._allocate_and_set_running(seq_group)
1185
+
1186
+ if partial_prefill_metadata is not None:
1187
+ partial_prefill_metadata.maybe_increment_partial_prefills(
1188
+ seq_group)
1189
+
1190
+ if enable_chunking and self.scheduler_config.is_multi_step:
1191
+ blocks_to_copy: List[Tuple[int, int]] = []
1192
+ # init_multi_step_from_lookahead_slots happens in append_slots
1193
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
1194
+ # This assert will trip when a copy-on-write happens. This is
1195
+ # not a concern as the very first sequence-group block
1196
+ # allocation happens above. Still, we have the assert to
1197
+ # catch any edge-cases.
1198
+ assert not blocks_to_copy
1199
+ else:
1200
+ seq_group.init_multi_step_from_lookahead_slots(
1201
+ num_lookahead_slots,
1202
+ num_scheduler_steps=self.scheduler_config.
1203
+ num_scheduler_steps,
1204
+ is_multi_step=self.scheduler_config.is_multi_step,
1205
+ enable_chunking=enable_chunking,
1206
+ )
1207
+
1208
+ seq_groups.append(
1209
+ ScheduledSequenceGroup(seq_group=seq_group,
1210
+ token_chunk_size=num_new_tokens))
1211
+ budget.add_num_batched_tokens(
1212
+ seq_group.request_id,
1213
+ num_batched_tokens=num_new_tokens_uncached,
1214
+ num_cached_tokens=num_new_tokens_cached,
1215
+ )
1216
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
1217
+
1218
+ # Queue requests that couldn't be scheduled.
1219
+ waiting_queue.extendleft(leftover_waiting_sequences)
1220
+ if len(seq_groups) > 0:
1221
+ self.prev_prompt = True
1222
+
1223
+ return SchedulerPrefillOutputs(
1224
+ seq_groups=seq_groups,
1225
+ ignored_seq_groups=ignored_seq_groups,
1226
+ num_lookahead_slots=self._get_num_lookahead_slots(
1227
+ is_prefill=True, enable_chunking=enable_chunking),
1228
+ )
1229
+
1230
+ def _schedule_default(self) -> SchedulerOutputs:
1231
+ """Schedule queued requests.
1232
+
1233
+ The current policy is designed to optimize throughput. First,
1234
+ it batches as many prefill requests as possible. Then it schedules
1235
+ decodes. If there is pressure on GPU memory, decode requests can
1236
+ be swapped out or preempted.
1237
+ """
1238
+ # Include running requests to the budget.
1239
+ budget = SchedulingBudget(
1240
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1241
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1242
+ )
1243
+ # Make sure we include num running seqs before scheduling prefill,
1244
+ # so that we don't schedule beyond max_num_seqs for prefill.
1245
+ for seq_group in self.running:
1246
+ budget.add_num_seqs(seq_group.request_id,
1247
+ seq_group.get_max_num_running_seqs())
1248
+ curr_loras = (set(
1249
+ seq_group.lora_int_id for seq_group in self.running
1250
+ if seq_group.lora_int_id > 0) if self.lora_enabled else None)
1251
+
1252
+ prefills = SchedulerPrefillOutputs.create_empty()
1253
+ running_scheduled = SchedulerRunningOutputs.create_empty()
1254
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1255
+
1256
+ # If any requests are swapped, prioritize swapped requests.
1257
+ if not self.swapped:
1258
+ prefills = self._schedule_prefills(budget,
1259
+ curr_loras,
1260
+ enable_chunking=False)
1261
+
1262
+ if len(prefills.seq_groups
1263
+ ) == 0 and self.scheduler_config.policy == "priority":
1264
+ self._schedule_priority_preemption(budget)
1265
+
1266
+ # Don't schedule decodes if prefills are scheduled.
1267
+ # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
1268
+ # only contains decode requests, not chunked prefills.
1269
+ if len(prefills.seq_groups) == 0:
1270
+ running_scheduled = self._schedule_running(budget,
1271
+ curr_loras,
1272
+ enable_chunking=False)
1273
+
1274
+ # If any sequence group is preempted, do not swap in any sequence
1275
+ # group, because it means there's no slot for new running requests.
1276
+ if (len(running_scheduled.preempted) +
1277
+ len(running_scheduled.swapped_out) == 0):
1278
+ swapped_in = \
1279
+ self._schedule_swapped(budget, curr_loras)
1280
+
1281
+ assert (budget.num_batched_tokens
1282
+ <= self.scheduler_config.max_num_batched_tokens)
1283
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1284
+
1285
+ # Update waiting requests.
1286
+ self.waiting.extendleft(running_scheduled.preempted)
1287
+ # Update new running requests.
1288
+ if len(prefills.seq_groups) > 0:
1289
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1290
+
1291
+ self.running.extend(running_scheduled.decode_seq_groups_list)
1292
+
1293
+ if len(swapped_in.decode_seq_groups) > 0:
1294
+ self.running.extend(
1295
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1296
+
1297
+ # Update swapped requests.
1298
+ self.swapped.extend(running_scheduled.swapped_out)
1299
+ preempted = len(running_scheduled.preempted) + len(
1300
+ running_scheduled.swapped_out)
1301
+
1302
+ # There should be no prefill from running queue because this policy
1303
+ # doesn't allow chunked prefills.
1304
+ assert len(running_scheduled.prefill_seq_groups) == 0
1305
+ assert len(swapped_in.prefill_seq_groups) == 0
1306
+
1307
+ # Merge lists
1308
+ num_prefill_groups = len(prefills.seq_groups)
1309
+ ignored_seq_groups_for_embeds = list[SequenceGroup]()
1310
+ if num_prefill_groups > 0:
1311
+ scheduled_seq_groups = prefills.seq_groups
1312
+ scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
1313
+ ignored_seq_groups_for_embeds.clear()
1314
+ else:
1315
+ scheduled_seq_groups = running_scheduled.decode_seq_groups
1316
+ if len(scheduled_seq_groups) > 0:
1317
+ using_prompt_embeds = scheduled_seq_groups[
1318
+ 0].seq_group.uses_prompt_embeds()
1319
+ ignored_seq_groups_for_embeds.clear()
1320
+ indices_ignored = list[int]()
1321
+ for i, schedule_seq_group in enumerate(scheduled_seq_groups):
1322
+ if using_prompt_embeds !=\
1323
+ schedule_seq_group.seq_group.uses_prompt_embeds():
1324
+ ignored_seq_groups_for_embeds.append(
1325
+ schedule_seq_group.seq_group)
1326
+ indices_ignored.append(i)
1327
+ if len(ignored_seq_groups_for_embeds) > 0:
1328
+ scheduled_seq_groups = [
1329
+ group for i, group in enumerate(scheduled_seq_groups)
1330
+ if i not in indices_ignored
1331
+ ]
1332
+ else:
1333
+ ignored_seq_groups_for_embeds.clear()
1334
+
1335
+ scheduled_seq_groups.extend(swapped_in.decode_seq_groups)
1336
+
1337
+ blocks_to_copy = running_scheduled.blocks_to_copy
1338
+ blocks_to_copy.extend(swapped_in.blocks_to_copy)
1339
+
1340
+ ignored_seq_groups = prefills.ignored_seq_groups
1341
+ ignored_seq_groups.extend(ignored_seq_groups_for_embeds)
1342
+ ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)
1343
+
1344
+ return SchedulerOutputs(
1345
+ scheduled_seq_groups=scheduled_seq_groups,
1346
+ num_prefill_groups=num_prefill_groups,
1347
+ num_batched_tokens=budget.num_batched_tokens +
1348
+ budget.num_cached_tokens,
1349
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1350
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1351
+ blocks_to_copy=blocks_to_copy,
1352
+ ignored_seq_groups=ignored_seq_groups,
1353
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
1354
+ running_queue_size=len(self.running),
1355
+ preempted=preempted,
1356
+ )
1357
+
1358
+ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
1359
+ """Schedule queued requests.
1360
+
1361
+ Chunked prefill allows prefill requests to be chunked and batched
1362
+ together with decode requests. This policy 1. schedules as many decode
1363
+ requests as possible, 2. schedules chunked prefill requests that are
1364
+ not yet finished, 3. schedules swapped requests, and 4. schedules new
1365
+ prefill requests.
1366
+
1367
+ The policy can sustain high GPU utilization because it can put
1368
+ prefill and decode requests in the same batch, while it improves
1369
+ inter-token latency because decode requests don't need to be blocked
1370
+ by prefill requests.
1371
+ """
1372
+ budget = SchedulingBudget(
1373
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1374
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1375
+ )
1376
+ curr_loras: Set[int] = set()
1377
+
1378
+ prefills = SchedulerPrefillOutputs.create_empty()
1379
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1380
+
1381
+ # Create partial prefill metadata
1382
+ partial_prefill_metadata = PartialPrefillMetadata.from_queues(
1383
+ running=self.running,
1384
+ waiting=self.waiting,
1385
+ scheduler_config=self.scheduler_config,
1386
+ )
1387
+
1388
+ # Decoding should be always scheduled first by fcfs.
1389
+ running_scheduled = self._schedule_running(
1390
+ budget,
1391
+ curr_loras,
1392
+ enable_chunking=True,
1393
+ partial_prefill_metadata=partial_prefill_metadata,
1394
+ )
1395
+
1396
+ # Schedule swapped out requests.
1397
+ # If preemption happens, it means we don't have space for swap-in.
1398
+ if len(running_scheduled.preempted) + len(
1399
+ running_scheduled.swapped_out) == 0:
1400
+ swapped_in = self._schedule_swapped(budget, curr_loras)
1401
+
1402
+ prefills = self._schedule_prefills(
1403
+ budget,
1404
+ curr_loras,
1405
+ enable_chunking=True,
1406
+ partial_prefill_metadata=partial_prefill_metadata,
1407
+ )
1408
+
1409
+ assert (budget.num_batched_tokens
1410
+ <= self.scheduler_config.max_num_batched_tokens)
1411
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1412
+
1413
+ # Update waiting requests.
1414
+ self.waiting.extendleft(running_scheduled.preempted)
1415
+
1416
+ # Update new running requests.
1417
+ # By default, vLLM scheduler prioritizes prefills.
1418
+ # Once chunked prefill is enabled,
1419
+ # the policy is changed to prioritize decode requests.
1420
+ self.running.extend(
1421
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1422
+ self.running.extend(
1423
+ [s.seq_group for s in swapped_in.prefill_seq_groups])
1424
+ self.running.extend(
1425
+ [s.seq_group for s in running_scheduled.decode_seq_groups])
1426
+ # Because multiple prefills may be running concurrently, we need to
1427
+ # make sure that prefills which are scheduled to finish are listed
1428
+ # before those that won't. This is so that on the next scheduling
1429
+ # iteration when they have transitioned to the decode stage, they are
1430
+ # properly prioritized over sequences that are still in the prefill
1431
+ # stage.
1432
+ self.running.extend(
1433
+ self._order_finishing_prefills_first(
1434
+ running_scheduled.prefill_seq_groups))
1435
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1436
+
1437
+ # Update swapped requests.
1438
+ self.swapped.extend(running_scheduled.swapped_out)
1439
+ # Put prefills first due to Attention backend ordering assumption.
1440
+ scheduled_seq_groups = (prefills.seq_groups +
1441
+ running_scheduled.prefill_seq_groups +
1442
+ swapped_in.prefill_seq_groups +
1443
+ running_scheduled.decode_seq_groups +
1444
+ swapped_in.decode_seq_groups)
1445
+ num_prefill_groups = (len(prefills.seq_groups) +
1446
+ len(swapped_in.prefill_seq_groups) +
1447
+ len(running_scheduled.prefill_seq_groups))
1448
+ # If all prompts, then we set num_lookahead_slots to 0;
1449
+ # this allows us to go through the `no_spec` path in
1450
+ # `spec_decode_worker.py`
1451
+ all_prefills = len(scheduled_seq_groups) == num_prefill_groups
1452
+ num_lookahead_slots = (0 if
1453
+ (all_prefills
1454
+ and not self.scheduler_config.is_multi_step)
1455
+ else running_scheduled.num_lookahead_slots)
1456
+ return SchedulerOutputs(
1457
+ scheduled_seq_groups=scheduled_seq_groups,
1458
+ num_prefill_groups=num_prefill_groups,
1459
+ num_batched_tokens=budget.num_batched_tokens +
1460
+ budget.num_cached_tokens,
1461
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1462
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1463
+ blocks_to_copy=running_scheduled.blocks_to_copy +
1464
+ swapped_in.blocks_to_copy,
1465
+ ignored_seq_groups=prefills.ignored_seq_groups +
1466
+ swapped_in.infeasible_seq_groups,
1467
+ num_lookahead_slots=num_lookahead_slots,
1468
+ running_queue_size=len(self.running),
1469
+ preempted=(len(running_scheduled.preempted) +
1470
+ len(running_scheduled.swapped_out)),
1471
+ )
1472
+
1473
+ def _order_finishing_prefills_first(
1474
+ self, scheduled_prefill_seqs: List[ScheduledSequenceGroup]
1475
+ ) -> List[SequenceGroup]:
1476
+ """Returns a list of prefilling SequenceGroups where sequences that are
1477
+ scheduled to finish prefilling are listed first"""
1478
+ finishing = [
1479
+ s.seq_group for s in scheduled_prefill_seqs
1480
+ if s.seq_group.get_num_uncomputed_tokens() == s.token_chunk_size
1481
+ ]
1482
+ not_finishing = [
1483
+ s.seq_group for s in scheduled_prefill_seqs
1484
+ if s.seq_group.get_num_uncomputed_tokens() != s.token_chunk_size
1485
+ ]
1486
+ return finishing + not_finishing
1487
+
1488
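A small sketch of the partitioning done by _order_finishing_prefills_first, with stand-in tuples instead of real ScheduledSequenceGroup objects (hypothetical data): a prefill "finishes" this step when its scheduled chunk covers all of its remaining uncomputed tokens.

    # (uncomputed_tokens, token_chunk_size) stand-ins for scheduled groups.
    scheduled = [(512, 256), (128, 128), (1024, 256), (64, 64)]
    finishing = [s for s in scheduled if s[0] == s[1]]
    not_finishing = [s for s in scheduled if s[0] != s[1]]
    assert finishing + not_finishing == [
        (128, 128), (64, 64), (512, 256), (1024, 256)]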
+ def _schedule(self) -> SchedulerOutputs:
1489
+ """Schedule queued requests."""
1490
+ if self.scheduler_config.chunked_prefill_enabled:
1491
+ return self._schedule_chunked_prefill()
1492
+ else:
1493
+ return self._schedule_default()
1494
+
1495
+ def _can_append_slots(self, seq_group: SequenceGroup,
1496
+ enable_chunking: bool) -> bool:
1497
+ """Determine whether or not we have enough space in the KV cache to
1498
+ continue generation of the sequence group.
1499
+ """
1500
+ # It is True only for testing case to trigger artificial preemption.
1501
+ if (self.enable_artificial_preemption
1502
+ and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
1503
+ and self.artificial_preempt_cnt > 0):
1504
+ self.artificial_preempt_cnt -= 1
1505
+ return False
1506
+
1507
+ is_prefill = seq_group.is_prefill()
1508
+ num_lookahead_slots = self._get_num_lookahead_slots(
1509
+ is_prefill, enable_chunking)
1510
+
1511
+ if is_prefill and num_lookahead_slots > 0:
1512
+ # Appending prefill slots only happens when multi-step and
1513
+ # chunked-prefill are enabled together.
1514
+ assert self.scheduler_config.is_multi_step and enable_chunking
1515
+
1516
+ return self.block_manager.can_append_slots(
1517
+ seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)
1518
+
1519
+ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
1520
+ # async_output_proc is allowed only when we have a single sequence
1521
+ # in the sequence group
1522
+ no_single_seq = seq_group.sampling_params is None or (
1523
+ seq_group.sampling_params.n == 1)
1524
+ return no_single_seq
1525
+
1526
+ def schedule(
1527
+ self
1528
+ ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
1529
+ # Schedule sequence groups.
1530
+ # This function call changes the internal states of the scheduler
1531
+ # such as self.running, self.swapped, and self.waiting.
1532
+ scheduler_start_time = time.perf_counter()
1533
+
1534
+ scheduler_outputs: SchedulerOutputs = self._schedule()
1535
+ now = time.time()
1536
+
1537
+ if not self.cache_config.enable_prefix_caching:
1538
+ common_computed_block_nums = []
1539
+
1540
+ allow_async_output_proc: bool = self.use_async_output_proc
1541
+
1542
+ # Create input data structures.
1543
+ seq_group_metadata_list: List[SequenceGroupMetadata] = []
1544
+ for i, scheduled_seq_group in enumerate(
1545
+ scheduler_outputs.scheduled_seq_groups):
1546
+ seq_group = scheduled_seq_group.seq_group
1547
+ token_chunk_size = scheduled_seq_group.token_chunk_size
1548
+ seq_group.maybe_set_first_scheduled_time(now)
1549
+
1550
+ seq_group_metadata = self._seq_group_metadata_cache[
1551
+ self.cache_id].get_object()
1552
+ seq_group_metadata.seq_data.clear()
1553
+ seq_group_metadata.block_tables.clear()
1554
+
1555
+ # seq_id -> SequenceData
1556
+ seq_data: Dict[int, SequenceData] = {}
1557
+ # seq_id -> physical block numbers
1558
+ block_tables: Dict[int, List[int]] = {}
1559
+
1560
+ if seq_group.is_encoder_decoder():
1561
+ # Encoder associated with SequenceGroup
1562
+ encoder_seq = seq_group.get_encoder_seq()
1563
+ assert encoder_seq is not None
1564
+ encoder_seq_data = encoder_seq.data
1565
+ # Block table for cross-attention
1566
+ # Also managed at SequenceGroup level
1567
+ cross_block_table = self.block_manager.get_cross_block_table(
1568
+ seq_group)
1569
+ else:
1570
+ encoder_seq_data = None
1571
+ cross_block_table = None
1572
+
1573
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1574
+ seq_id = seq.seq_id
1575
+ seq_data[seq_id] = seq.data
1576
+ block_tables[seq_id] = self.block_manager.get_block_table(seq)
1577
+ self.block_manager.access_all_blocks_in_seq(seq, now)
1578
+
1579
+ if self.cache_config.enable_prefix_caching:
1580
+ common_computed_block_nums = (
1581
+ self.block_manager.get_common_computed_block_ids(
1582
+ seq_group.get_seqs(status=SequenceStatus.RUNNING)))
1583
+
1584
+ do_sample = True
1585
+ is_prompt = seq_group.is_prefill()
1586
+ # We should send the metadata to workers when the first prefill
1587
+ # is sent. Subsequent requests could be chunked prefill or decode.
1588
+ is_first_prefill = False
1589
+ if is_prompt:
1590
+ seqs = seq_group.get_seqs()
1591
+ # Prefill has only 1 sequence.
1592
+ assert len(seqs) == 1
1593
+ num_computed_tokens = seqs[0].data.get_num_computed_tokens()
1594
+ is_first_prefill = num_computed_tokens == 0
1595
+ # If not all prompt tokens are computed after this iteration, it
1596
+ # means the prefill is chunked, and we don't need sampling.
1597
+ # NOTE: We use get_len instead of get_prompt_len because when
1598
+ # a sequence is preempted, prefill includes previous generated
1599
+ # output tokens.
1600
+ if (token_chunk_size + num_computed_tokens
1601
+ < seqs[0].data.get_len()):
1602
+ do_sample = False
1603
+
1604
+ # It assumes the scheduled_seq_groups is ordered by
1605
+ # prefill < decoding.
1606
+ if is_first_prefill or not self.scheduler_config.send_delta_data:
1607
+ seq_group_metadata = SequenceGroupMetadata(
1608
+ request_id=seq_group.request_id,
1609
+ is_prompt=is_prompt,
1610
+ seq_data=seq_data,
1611
+ sampling_params=seq_group.sampling_params,
1612
+ block_tables=block_tables,
1613
+ do_sample=do_sample,
1614
+ pooling_params=seq_group.pooling_params,
1615
+ token_chunk_size=token_chunk_size,
1616
+ lora_request=seq_group.lora_request,
1617
+ computed_block_nums=common_computed_block_nums,
1618
+ encoder_seq_data=encoder_seq_data,
1619
+ cross_block_table=cross_block_table,
1620
+ state=seq_group.state,
1621
+ token_type_ids=seq_group.token_type_ids,
1622
+ # `multi_modal_data` will only be present for the 1st comm
1623
+ # between engine and worker.
1624
+ # the subsequent comms can still use delta, but
1625
+ # `multi_modal_data` will be None.
1626
+ multi_modal_data=(seq_group.multi_modal_data
1627
+ if scheduler_outputs.num_prefill_groups
1628
+ > 0 else None),
1629
+ multi_modal_placeholders=(
1630
+ seq_group.multi_modal_placeholders
1631
+ if scheduler_outputs.num_prefill_groups > 0 else None),
1632
+ prompt_adapter_request=seq_group.prompt_adapter_request,
1633
+ )
1634
+ else:
1635
+ # When SPMD mode is enabled, we only send delta data except for
1636
+ # the first request to reduce serialization cost.
1637
+ seq_data_delta = {}
1638
+ for id, data in seq_data.items():
1639
+ seq_data_delta[id] = data.get_delta_and_reset()
1640
+ seq_group_metadata = SequenceGroupMetadataDelta(
1641
+ seq_data_delta,
1642
+ seq_group.request_id,
1643
+ block_tables,
1644
+ is_prompt,
1645
+ do_sample=do_sample,
1646
+ token_chunk_size=token_chunk_size,
1647
+ computed_block_nums=common_computed_block_nums,
1648
+ )
1649
+ seq_group_metadata_list.append(seq_group_metadata)
1650
+
1651
+ if allow_async_output_proc:
1652
+ allow_async_output_proc = self._allow_async_output_proc(
1653
+ seq_group)
1654
+
1655
+ # Now that the batch has been created, we can assume all blocks in the
1656
+ # batch will have been computed before the next scheduling invocation.
1657
+ # This is because the engine assumes that a failure in model execution
1658
+ # will crash the vLLM instance / will not retry.
1659
+ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
1660
+ self.block_manager.mark_blocks_as_computed(
1661
+ scheduled_seq_group.seq_group,
1662
+ scheduled_seq_group.token_chunk_size)
1663
+
1664
+ self._seq_group_metadata_cache[self.next_cache_id].reset()
1665
+
1666
+ scheduler_time = time.perf_counter() - scheduler_start_time
1667
+ # Add this scheduler time to all the sequences that are currently
1668
+ # running. This will help estimate if the scheduler is a significant
1669
+ # component in the e2e latency.
1670
+ for seq_group in self.running:
1671
+ if seq_group is not None and seq_group.metrics is not None:
1672
+ if seq_group.metrics.scheduler_time is not None:
1673
+ seq_group.metrics.scheduler_time += scheduler_time
1674
+ else:
1675
+ seq_group.metrics.scheduler_time = scheduler_time
1676
+
1677
+ # Move to next cache (if exists)
1678
+ self.cache_id = self.next_cache_id
1679
+
1680
+ # Return results
1681
+ return (seq_group_metadata_list, scheduler_outputs,
1682
+ allow_async_output_proc)
1683
+
1684
+ def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
1685
+ self.block_manager.fork(parent_seq, child_seq)
1686
+
1687
+ def free_seq(self, seq: Sequence) -> None:
1688
+ """Free a sequence from a block table."""
1689
+ self.block_manager.free(seq)
1690
+
1691
+ def _free_finished_seqs(self, seq_group: SequenceGroup) -> None:
1692
+ """Free finished seqs in a sequence group."""
1693
+ for seq in seq_group.get_seqs():
1694
+ if seq.is_finished():
1695
+ self.free_seq(seq)
1696
+
1697
+ def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None:
1698
+ if seq_group.is_finished():
1699
+ # Free cross-attention block table, if it exists
1700
+ self._free_seq_group_cross_attn_blocks(seq_group)
1701
+
1702
+ # Add the finished requests to the finished requests list.
1703
+ # This list will be used to update the Mamba cache in the
1704
+ # next step.
1705
+ self._finished_requests_ids.append(seq_group.request_id)
1706
+
1707
+ # Free finished seqs
1708
+ self._free_finished_seqs(seq_group)
1709
+
1710
+ def free_finished_seq_groups(self) -> None:
1711
+ remaining: Deque[SequenceGroup] = deque()
1712
+ for seq_group in self.running:
1713
+ self._free_finished_seq_group(seq_group)
1714
+ if not seq_group.is_finished():
1715
+ remaining.append(seq_group)
1716
+
1717
+ self.running = remaining
1718
+
1719
+ # Handle async stopped sequence groups
1720
+ # (ones that reached max model len)
1721
+ if self._async_stopped:
1722
+ for seq_group in self._async_stopped:
1723
+ self._free_seq_group_cross_attn_blocks(seq_group)
1724
+ self._finished_requests_ids.append(seq_group.request_id)
1725
+
1726
+ # Free finished seqs
1727
+ self._free_finished_seqs(seq_group)
1728
+
1729
+ self._async_stopped.clear()
1730
+
1731
+ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
1732
+ self.block_manager.allocate(seq_group)
1733
+ for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
1734
+ seq.status = SequenceStatus.RUNNING
1735
+
1736
+ def _append_slots(
1737
+ self,
1738
+ seq_group: SequenceGroup,
1739
+ blocks_to_copy: List[Tuple[int, int]],
1740
+ enable_chunking: bool = False,
1741
+ ) -> None:
1742
+ """Appends new slots to the sequences in the given sequence group.
1743
+
1744
+ Args:
1745
+ seq_group (SequenceGroup): The sequence group containing the
1746
+ sequences to append slots to.
1747
+ blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two
1748
+ ints, the first int is the source block index, and the second
1749
+ int is the destination block index. This list is updated with
1750
+ the new source and destination block indices for the appended
1751
+ slots.
1752
+ enable_chunking (bool): True if chunked prefill is enabled.
1753
+ """
1754
+ is_prefill: bool = seq_group.is_prefill()
1755
+ num_lookahead_slots: int = self._get_num_lookahead_slots(
1756
+ is_prefill, enable_chunking)
1757
+
1758
+ seq_group.init_multi_step_from_lookahead_slots(
1759
+ num_lookahead_slots,
1760
+ num_scheduler_steps=self.scheduler_config.num_scheduler_steps,
1761
+ is_multi_step=self.scheduler_config.is_multi_step,
1762
+ enable_chunking=enable_chunking,
1763
+ )
1764
+
1765
+ seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING
1766
+ if self.scheduler_config.is_multi_step and enable_chunking:
1767
+ # In multi-step chunked-prefill any sequence type can have
1768
+ # slots appended.
1769
+ seq_status = None
1770
+
1771
+ for seq in seq_group.get_seqs(status=seq_status):
1772
+ cows = self.block_manager.append_slots(seq, num_lookahead_slots)
1773
+ if len(cows) > 0:
1774
+ blocks_to_copy.extend(cows)
1775
+
1776
+ def _preempt(self, seq_group: SequenceGroup,
1777
+ blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
1778
+ # If preemption mode is not specified, we determine the mode as follows:
1779
+ # We use recomputation by default since it incurs lower overhead than
1780
+ # swapping. However, when the sequence group has multiple sequences
1781
+ # (e.g., beam search), recomputation is not currently supported. In
1782
+ # such a case, we use swapping instead.
1783
+ # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
1784
+ # As swapped sequences are prioritized over waiting sequences,
1785
+ # sequence groups with multiple sequences are implicitly prioritized
1786
+ # over sequence groups with a single sequence.
1787
+ # TODO(woosuk): Support recomputation for sequence groups with multiple
1788
+ # sequences. This may require a more sophisticated CUDA kernel.
1789
+ if self.user_specified_preemption_mode is None:
1790
+ if seq_group.get_max_num_running_seqs() == 1:
1791
+ preemption_mode = PreemptionMode.RECOMPUTE
1792
+ else:
1793
+ preemption_mode = PreemptionMode.SWAP
1794
+
1795
+ elif self.user_specified_preemption_mode == "swap":
1796
+ preemption_mode = PreemptionMode.SWAP
1797
+ else:
1798
+ preemption_mode = PreemptionMode.RECOMPUTE
1799
+
1800
+ if self.num_cumulative_preemption % 50 == 0:
1801
+ logger.warning(
1802
+ "Sequence group %s is preempted by %s mode because there is "
1803
+ "not enough KV cache space. This can affect the end-to-end "
1804
+ "performance. Increase gpu_memory_utilization or "
1805
+ "tensor_parallel_size to provide more KV cache memory. "
1806
+ "total_num_cumulative_preemption=%d",
1807
+ seq_group.request_id,
1808
+ preemption_mode,
1809
+ self.num_cumulative_preemption + 1,
1810
+ )
1811
+ self.num_cumulative_preemption += 1
1812
+
1813
+ if preemption_mode == PreemptionMode.RECOMPUTE:
1814
+ self._preempt_by_recompute(seq_group)
1815
+ elif preemption_mode == PreemptionMode.SWAP:
1816
+ self._preempt_by_swap(seq_group, blocks_to_swap_out)
1817
+ else:
1818
+ raise AssertionError("Invalid preemption mode.")
1819
+ return preemption_mode
1820
+
1821
+ def _preempt_by_recompute(
1822
+ self,
1823
+ seq_group: SequenceGroup,
1824
+ ) -> None:
1825
+ seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
1826
+ assert len(seqs) == 1
1827
+ for seq in seqs:
1828
+ seq.status = SequenceStatus.WAITING
1829
+ self.free_seq(seq)
1830
+ seq.reset_state_for_recompute()
1831
+ self._free_seq_group_cross_attn_blocks(seq_group)
1832
+
1833
+ def _preempt_by_swap(
1834
+ self,
1835
+ seq_group: SequenceGroup,
1836
+ blocks_to_swap_out: List[Tuple[int, int]],
1837
+ ) -> None:
1838
+ self._swap_out(seq_group, blocks_to_swap_out)
1839
+
1840
+     def _swap_in(
+         self,
+         seq_group: SequenceGroup,
+         blocks_to_swap_in: List[Tuple[int, int]],
+     ) -> None:
+         mapping = self.block_manager.swap_in(seq_group)
+         blocks_to_swap_in.extend(mapping)
+         for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
+             seq.status = SequenceStatus.RUNNING
+
+     def _swap_out(
+         self,
+         seq_group: SequenceGroup,
+         blocks_to_swap_out: List[Tuple[int, int]],
+     ) -> None:
+         if not self.block_manager.can_swap_out(seq_group):
+             # FIXME(woosuk): Abort the sequence group instead of aborting the
+             # entire engine.
+             raise RuntimeError(
+                 "Aborted due to the lack of CPU swap space. Please increase "
+                 "the swap space to avoid this error.")
+         mapping = self.block_manager.swap_out(seq_group)
+         blocks_to_swap_out.extend(mapping)
+         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+             seq.status = SequenceStatus.SWAPPED
+
+     def _passed_delay(self, now: float) -> bool:
+         if self.prev_prompt:
+             self.last_prompt_latency = now - self.prev_time
+         self.prev_time, self.prev_prompt = now, False
+         # Delay scheduling prompts to let the waiting queue fill up.
+         if self.scheduler_config.delay_factor > 0 and self.waiting:
+             earliest_arrival_time = min(
+                 [e.metrics.arrival_time for e in self.waiting])
+             passed_delay = ((now - earliest_arrival_time)
+                             > (self.scheduler_config.delay_factor *
+                                self.last_prompt_latency) or not self.running)
+         else:
+             passed_delay = True
+         return passed_delay
+
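Illustration (editor's sketch, not part of the diff): with a positive `delay_factor`, newly arrived prompts are held back until the oldest waiting request has aged at least `delay_factor * last_prompt_latency` seconds, unless nothing is running. A worked example with hypothetical numbers:

    # Mirrors the predicate in _passed_delay above.
    delay_factor = 0.5            # scheduler_config.delay_factor
    last_prompt_latency = 2.0     # seconds spent on the previous prompt batch
    now = 100.0
    earliest_arrival_time = 99.4  # oldest waiting request arrived 0.6 s ago
    running_queue_empty = False   # corresponds to `not self.running`

    passed_delay = ((now - earliest_arrival_time)
                    > delay_factor * last_prompt_latency) or running_queue_empty
    print(passed_delay)  # False: 0.6 s < 1.0 s, so prompt scheduling waits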
+     def _get_num_lookahead_slots(self, is_prefill: bool,
+                                  enable_chunking: bool) -> int:
+         """The number of slots to allocate per sequence per step, beyond known
+         token ids. Speculative decoding uses these slots to store KV activations
+         of tokens which may or may not be accepted.
+
+         Speculative decoding does not yet support prefill, so we do not perform
+         lookahead allocation for prefill.
+
+         When chunking is enabled with multi-step, we allocate lookahead slots
+         for the prefills so that they are available when the prefills turn into
+         decodes in the first step.
+         """
+         if is_prefill:
+             if self.scheduler_config.is_multi_step and enable_chunking:
+                 # num_lookahead_slots was introduced in the context of decodes
+                 # for speculative decoding.
+                 # When num_scheduler_steps is 8, say, num_lookahead_slots is 7:
+                 # we are doing one step of decode anyway and wish to do 7 more.
+                 #
+                 # "Lookaheads" for prefills were introduced to support
+                 # chunked prefill in multi-step.
+                 return self.scheduler_config.num_lookahead_slots + 1
+             else:
+                 return 0
+
+         return self.scheduler_config.num_lookahead_slots
+
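Illustration (editor's sketch, not part of the diff): a worked example of the lookahead-slot policy above, assuming a hypothetical configuration with num_scheduler_steps = 8, which corresponds to num_lookahead_slots = 7.

    num_lookahead_slots = 7  # hypothetical: num_scheduler_steps = 8

    def lookahead(is_prefill: bool, is_multi_step: bool,
                  enable_chunking: bool) -> int:
        # Mirrors _get_num_lookahead_slots above.
        if is_prefill:
            if is_multi_step and enable_chunking:
                return num_lookahead_slots + 1
            return 0
        return num_lookahead_slots

    print(lookahead(False, True, True))   # 7: decode keeps the configured slots
    print(lookahead(True, True, True))    # 8: prefill that will turn into decode
    print(lookahead(True, False, False))  # 0: plain prefill, no lookahead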
+     def _get_num_new_uncached_and_cached_tokens(
+         self,
+         seq_group: SequenceGroup,
+         status: SequenceStatus,
+         enable_chunking: bool,
+         budget: SchedulingBudget,
+         partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
+     ) -> Tuple[int, int]:
+         """
+         Returns the number of new uncached and cached tokens to schedule for a
+         given sequence group that's in a given `status`.
+
+         The API may chunk the number of tokens to compute based on `budget`
+         if `enable_chunking` is True. If a sequence group has multiple
+         sequences (e.g., running beam search), it is in the decoding phase,
+         so chunking doesn't happen.
+
+         Returns (0, 0) if no new tokens can be scheduled under the current
+         token budget.
+
+         The cached tokens' blocks are already computed, and the attention
+         backend will reuse the cached blocks rather than recomputing them. So
+         the scheduler can schedule these cached tokens "for free".
+
+         Args:
+             seq_group: The sequence group to get the number of new tokens to
+                 schedule.
+             status: The status of the sequences to get the number of new tokens
+                 to schedule.
+             enable_chunking: Whether to chunk the number of tokens to compute.
+             budget: The budget to chunk the number of tokens to compute.
+             partial_prefill_metadata: Information about the partial prefills
+                 that are currently running.
+
+         Returns:
+             A tuple of two ints. The first int is the number of new uncached
+             tokens to schedule. The second int is the number of cached tokens.
+             If no more new tokens can be scheduled, returns (0, 0).
+         """
+         num_cached_new_tokens = 0
+         num_uncached_new_tokens = 0
+
+         seqs = seq_group.get_seqs(status=status)
+         # Compute the number of new uncached and cached tokens for
+         # each sequence.
+         for seq in seqs:
+             if not seq.is_prefill():
+                 # Decode sequences should always just have 1 uncached token.
+                 # TODO(rickyx): Actually is this still correct for multi-step?
+                 num_uncached_new_tokens += 1
+                 continue
+
+             num_computed_tokens_seq = seq.get_num_computed_tokens()
+             all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq
+             if not self.cache_config.enable_prefix_caching:
+                 # If prefix caching is not enabled, all new tokens are uncached.
+                 num_uncached_new_tokens += all_num_new_tokens_seq
+                 continue
+
+             # NOTE: a cached token might currently live in a block that sits in
+             # the evictor, meaning it is not yet allocated. However, we don't
+             # exclude such tokens from the cached count because they are
+             # guaranteed to be allocated later if the sequence can be allocated.
+             num_cached_tokens_seq = self.block_manager.get_num_cached_tokens(
+                 seq)
+
+             # Sanity check.
+             if num_cached_tokens_seq < num_computed_tokens_seq:
+                 # This should only happen with chunked prefill, and
+                 # the seq is still in prefill. The `num_cached_tokens_seq`
+                 # is the value we calculated on scheduling the first prefill.
+                 # For subsequent continuous prefill steps, we cached the
+                 # number of cached tokens for the sequence, so the cached token
+                 # count could be less than the number of computed tokens.
+                 # See comments on `ComputedBlocksTracker` for more details.
+                 assert (
+                     seq.is_prefill() and seq.status == SequenceStatus.RUNNING
+                     and self.scheduler_config.chunked_prefill_enabled
+                 ), ("Number of cached tokens should not be less than the "
+                     "number of computed tokens for a sequence that's still "
+                     f"in prefill. But there are {num_cached_tokens_seq} cached "
+                     f"tokens and {num_computed_tokens_seq} computed tokens "
+                     f"for sequence {seq.seq_id}.")
+
+             num_cached_new_tokens_seq = max(
+                 0, num_cached_tokens_seq - num_computed_tokens_seq)
+             num_uncached_new_tokens_seq = (all_num_new_tokens_seq -
+                                            num_cached_new_tokens_seq)
+
+             num_uncached_new_tokens += num_uncached_new_tokens_seq
+             num_cached_new_tokens += num_cached_new_tokens_seq
+
+         if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0:
+             # For a fully cache-hit sequence, we still need to recompute the
+             # last token, so we need at least 1 uncached token to schedule.
+             # See ModelRunner._compute_for_prefix_cache_hit for more details.
+             num_uncached_new_tokens = 1
+             num_cached_new_tokens -= 1
+
+         if enable_chunking and len(seqs) == 1:
+             # Chunk if a running request cannot fit in the given budget.
+             # If the number of seqs > 1, the group is doing beam search
+             # in the decode phase. Do not chunk.
+             num_uncached_new_tokens = self._chunk_new_tokens_to_schedule(
+                 self.scheduler_config,
+                 self.cache_config,
+                 budget,
+                 self._get_prompt_limit(seq_group),
+                 num_uncached_new_tokens,
+                 self.partial_prefill_budget_lookup_list,
+                 partial_prefill_metadata,
+             )
+
+         return num_uncached_new_tokens, num_cached_new_tokens
+
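Illustration (editor's sketch, not part of the diff): how the uncached/cached split above behaves for a single prefill sequence with prefix caching, using hypothetical token counts.

    prompt_len = 48            # seq.get_len()
    num_computed_tokens = 0    # nothing has been scheduled yet
    num_cached_tokens = 32     # two 16-token blocks hit in the prefix cache

    num_cached_new = max(0, num_cached_tokens - num_computed_tokens)        # 32
    num_uncached_new = (prompt_len - num_computed_tokens) - num_cached_new  # 16

    # Fully cached prompt: the last token must still be recomputed, so one
    # uncached token is reserved (the adjustment near the end of the method).
    num_cached_new = max(0, 48 - num_computed_tokens)                       # 48
    num_uncached_new = (prompt_len - num_computed_tokens) - num_cached_new  # 0
    if num_uncached_new == 0 and num_cached_new > 0:
        num_uncached_new, num_cached_new = 1, num_cached_new - 1
    print(num_uncached_new, num_cached_new)  # 1 47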
+     @staticmethod
+     def _chunk_new_tokens_to_schedule(
+         scheduler_config: SchedulerConfig,
+         cache_config: CacheConfig,
+         budget: SchedulingBudget,
+         prompt_limit: int,
+         num_new_tokens: int,
+         partial_prefill_budget_lookup_list: List[int],
+         partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
+     ) -> int:
+         """
+         Chunks the number of new tokens to schedule based on the budget when
+         chunked prefill is enabled.
+
+         Args:
+             scheduler_config: The scheduler config.
+             cache_config: The cache config.
+             budget: The budget to chunk the number of tokens to compute.
+             prompt_limit: The maximum number of tokens allowed in a prompt.
+             num_new_tokens: The number of new tokens to schedule.
+             partial_prefill_budget_lookup_list: Per-prefill token budgets,
+                 indexed by the number of schedulable partial prefills.
+             partial_prefill_metadata: Information about the partial prefills
+                 that are currently running.
+
+         Returns:
+             The number of new tokens to schedule after chunking.
+         """
+         remaining_token_budget = budget.remaining_token_budget()
+         if scheduler_config.is_multi_step:
+             # The current multi-step + chunked prefill capability does
+             # not actually support chunking prompts.
+             #
+             # Therefore, `num_new_tokens` is computed in the same fashion
+             # for both multi-step+chunked-prefill &
+             # multi-step+chunked-prefill+APC.
+             #
+             # Prompts with more tokens than the current remaining budget
+             # are postponed to future scheduler steps.
+             if num_new_tokens > prompt_limit:
+                 # If the seq_group is in the prompt stage, pass
+                 # num_new_tokens as-is so the caller can ignore
+                 # the sequence.
+                 return num_new_tokens
+
+             return 0 if num_new_tokens > \
+                 remaining_token_budget else num_new_tokens
+
+         # Get the number of tokens to allocate to this prefill slot.
+         prefill_slot_budget = (
+             remaining_token_budget if partial_prefill_metadata is None else
+             partial_prefill_budget_lookup_list[
+                 partial_prefill_metadata.schedulable_prefills])
+
+         if cache_config.enable_prefix_caching:
+             # When prefix caching is enabled and we're partially prefilling
+             # a sequence, we always allocate a number of new tokens that is
+             # divisible by the block size to avoid partial block matching.
+             block_size = cache_config.block_size
+             # Don't exceed either the total budget or the slot budget.
+             # Take the min of the two and round down to the nearest multiple
+             # of the block size:
+             remaining_token_budget = (
+                 min(remaining_token_budget, prefill_slot_budget) //
+                 block_size) * block_size
+             # NB: In the case where num_new_tokens < budget, we are
+             # finishing prefill for this sequence, so we do not need to
+             # allocate a full block.
+
+         num_new_tokens = min(num_new_tokens, remaining_token_budget,
+                              prefill_slot_budget)
+
+         return num_new_tokens
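Illustration (editor's sketch, not part of the diff): the block-aligned chunking on the prefix-caching path above, with hypothetical budgets.

    block_size = 16
    remaining_token_budget = 768   # budget.remaining_token_budget()
    prefill_slot_budget = 500      # looked up from the per-prefill budget list
    num_new_tokens = 1000          # uncached prompt tokens left to prefill

    # Round the effective budget down to a multiple of the block size so a
    # partially prefilled sequence never stops mid-block.
    remaining_token_budget = (min(remaining_token_budget, prefill_slot_budget)
                              // block_size) * block_size                  # 496
    chunk = min(num_new_tokens, remaining_token_budget, prefill_slot_budget)
    print(chunk)  # 496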