vllm-cpu-avx512vnni 0.10.2.post2 (cp312-cp312-manylinux_2_17_x86_64.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu-avx512vnni might be problematic.

Files changed (1395)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2022 -0
  5. vllm/_ipex_ops.py +404 -0
  6. vllm/_version.py +34 -0
  7. vllm/adapter_commons/__init__.py +0 -0
  8. vllm/adapter_commons/layers.py +16 -0
  9. vllm/adapter_commons/models.py +106 -0
  10. vllm/adapter_commons/request.py +26 -0
  11. vllm/adapter_commons/utils.py +93 -0
  12. vllm/adapter_commons/worker_manager.py +39 -0
  13. vllm/assets/__init__.py +0 -0
  14. vllm/assets/audio.py +45 -0
  15. vllm/assets/base.py +41 -0
  16. vllm/assets/image.py +50 -0
  17. vllm/assets/video.py +138 -0
  18. vllm/attention/__init__.py +19 -0
  19. vllm/attention/backends/__init__.py +0 -0
  20. vllm/attention/backends/abstract.py +348 -0
  21. vllm/attention/backends/differential_flash_attn.py +935 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1499 -0
  23. vllm/attention/backends/flash_attn.py +933 -0
  24. vllm/attention/backends/flashmla.py +238 -0
  25. vllm/attention/backends/mla/__init__.py +0 -0
  26. vllm/attention/backends/mla/common.py +1310 -0
  27. vllm/attention/backends/placeholder_attn.py +340 -0
  28. vllm/attention/backends/rocm_aiter_mla.py +410 -0
  29. vllm/attention/backends/rocm_flash_attn.py +953 -0
  30. vllm/attention/backends/triton_mla.py +111 -0
  31. vllm/attention/backends/utils.py +610 -0
  32. vllm/attention/backends/xformers.py +805 -0
  33. vllm/attention/layer.py +552 -0
  34. vllm/attention/layers/__init__.py +0 -0
  35. vllm/attention/layers/chunked_local_attention.py +91 -0
  36. vllm/attention/layers/cross_attention.py +159 -0
  37. vllm/attention/layers/encoder_only_attention.py +86 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  40. vllm/attention/ops/common.py +139 -0
  41. vllm/attention/ops/flashmla.py +123 -0
  42. vllm/attention/ops/merge_attn_states.py +43 -0
  43. vllm/attention/ops/paged_attn.py +261 -0
  44. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  45. vllm/attention/ops/prefix_prefill.py +928 -0
  46. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  47. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  48. vllm/attention/ops/triton_decode_attention.py +676 -0
  49. vllm/attention/ops/triton_flash_attention.py +984 -0
  50. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  51. vllm/attention/ops/triton_unified_attention.py +854 -0
  52. vllm/attention/selector.py +243 -0
  53. vllm/attention/utils/__init__.py +0 -0
  54. vllm/attention/utils/fa_utils.py +85 -0
  55. vllm/attention/utils/kv_sharing_utils.py +33 -0
  56. vllm/beam_search.py +87 -0
  57. vllm/benchmarks/__init__.py +0 -0
  58. vllm/benchmarks/datasets.py +2651 -0
  59. vllm/benchmarks/latency.py +170 -0
  60. vllm/benchmarks/lib/__init__.py +3 -0
  61. vllm/benchmarks/lib/endpoint_request_func.py +510 -0
  62. vllm/benchmarks/lib/ready_checker.py +72 -0
  63. vllm/benchmarks/lib/utils.py +80 -0
  64. vllm/benchmarks/serve.py +1247 -0
  65. vllm/benchmarks/throughput.py +696 -0
  66. vllm/collect_env.py +823 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/activation_quant_fusion.py +193 -0
  69. vllm/compilation/backends.py +641 -0
  70. vllm/compilation/base_static_graph.py +51 -0
  71. vllm/compilation/collective_fusion.py +1190 -0
  72. vllm/compilation/compiler_interface.py +572 -0
  73. vllm/compilation/counter.py +47 -0
  74. vllm/compilation/cuda_graph.py +193 -0
  75. vllm/compilation/cuda_piecewise_backend.py +117 -0
  76. vllm/compilation/decorators.py +316 -0
  77. vllm/compilation/fix_functionalization.py +208 -0
  78. vllm/compilation/fusion.py +600 -0
  79. vllm/compilation/fusion_attn.py +303 -0
  80. vllm/compilation/fx_utils.py +84 -0
  81. vllm/compilation/inductor_pass.py +136 -0
  82. vllm/compilation/monitor.py +57 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +165 -0
  85. vllm/compilation/pass_manager.py +88 -0
  86. vllm/compilation/sequence_parallelism.py +484 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +50 -0
  89. vllm/compilation/wrapper.py +138 -0
  90. vllm/config/__init__.py +3921 -0
  91. vllm/config/cache.py +214 -0
  92. vllm/config/compilation.py +580 -0
  93. vllm/config/kv_events.py +50 -0
  94. vllm/config/kv_transfer.py +111 -0
  95. vllm/config/load.py +113 -0
  96. vllm/config/lora.py +132 -0
  97. vllm/config/parallel.py +446 -0
  98. vllm/config/scheduler.py +304 -0
  99. vllm/config/utils.py +29 -0
  100. vllm/connections.py +174 -0
  101. vllm/core/__init__.py +0 -0
  102. vllm/core/block/__init__.py +0 -0
  103. vllm/core/block/block_table.py +399 -0
  104. vllm/core/block/common.py +371 -0
  105. vllm/core/block/cpu_gpu_block_allocator.py +439 -0
  106. vllm/core/block/interfaces.py +319 -0
  107. vllm/core/block/naive_block.py +466 -0
  108. vllm/core/block/prefix_caching_block.py +1135 -0
  109. vllm/core/block/utils.py +28 -0
  110. vllm/core/block_manager.py +523 -0
  111. vllm/core/evictor.py +157 -0
  112. vllm/core/interfaces.py +139 -0
  113. vllm/core/placeholder_block_space_manager.py +103 -0
  114. vllm/core/scheduler.py +2028 -0
  115. vllm/device_allocator/__init__.py +0 -0
  116. vllm/device_allocator/cumem.py +286 -0
  117. vllm/distributed/__init__.py +6 -0
  118. vllm/distributed/communication_op.py +41 -0
  119. vllm/distributed/device_communicators/__init__.py +0 -0
  120. vllm/distributed/device_communicators/all2all.py +259 -0
  121. vllm/distributed/device_communicators/all_reduce_utils.py +292 -0
  122. vllm/distributed/device_communicators/base_device_communicator.py +277 -0
  123. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  124. vllm/distributed/device_communicators/cuda_communicator.py +294 -0
  125. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  126. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  127. vllm/distributed/device_communicators/pynccl.py +290 -0
  128. vllm/distributed/device_communicators/pynccl_wrapper.py +382 -0
  129. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  130. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  131. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  132. vllm/distributed/device_communicators/symm_mem.py +136 -0
  133. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  134. vllm/distributed/device_communicators/xpu_communicator.py +69 -0
  135. vllm/distributed/eplb/__init__.py +8 -0
  136. vllm/distributed/eplb/eplb_state.py +619 -0
  137. vllm/distributed/eplb/rebalance_algo.py +234 -0
  138. vllm/distributed/eplb/rebalance_execute.py +424 -0
  139. vllm/distributed/kv_events.py +362 -0
  140. vllm/distributed/kv_transfer/README.md +29 -0
  141. vllm/distributed/kv_transfer/__init__.py +13 -0
  142. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  143. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  145. vllm/distributed/kv_transfer/kv_connector/factory.py +108 -0
  146. vllm/distributed/kv_transfer/kv_connector/utils.py +246 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/base.py +356 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +266 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1319 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +484 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +542 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +266 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +414 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  158. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  159. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  160. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  161. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  162. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  163. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  164. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  165. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  166. vllm/distributed/parallel_state.py +1489 -0
  167. vllm/distributed/tpu_distributed_utils.py +178 -0
  168. vllm/distributed/utils.py +536 -0
  169. vllm/engine/__init__.py +0 -0
  170. vllm/engine/arg_utils.py +1857 -0
  171. vllm/engine/async_llm_engine.py +1044 -0
  172. vllm/engine/async_timeout.py +173 -0
  173. vllm/engine/llm_engine.py +1849 -0
  174. vllm/engine/metrics.py +577 -0
  175. vllm/engine/metrics_types.py +84 -0
  176. vllm/engine/multiprocessing/__init__.py +145 -0
  177. vllm/engine/multiprocessing/client.py +643 -0
  178. vllm/engine/multiprocessing/engine.py +470 -0
  179. vllm/engine/output_processor/__init__.py +0 -0
  180. vllm/engine/output_processor/interfaces.py +61 -0
  181. vllm/engine/output_processor/single_step.py +145 -0
  182. vllm/engine/output_processor/stop_checker.py +131 -0
  183. vllm/engine/output_processor/util.py +28 -0
  184. vllm/engine/protocol.py +343 -0
  185. vllm/entrypoints/__init__.py +0 -0
  186. vllm/entrypoints/api_server.py +178 -0
  187. vllm/entrypoints/chat_utils.py +1535 -0
  188. vllm/entrypoints/cli/__init__.py +12 -0
  189. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  190. vllm/entrypoints/cli/benchmark/base.py +25 -0
  191. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  192. vllm/entrypoints/cli/benchmark/main.py +58 -0
  193. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  194. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  195. vllm/entrypoints/cli/collect_env.py +36 -0
  196. vllm/entrypoints/cli/main.py +60 -0
  197. vllm/entrypoints/cli/openai.py +214 -0
  198. vllm/entrypoints/cli/run_batch.py +69 -0
  199. vllm/entrypoints/cli/serve.py +232 -0
  200. vllm/entrypoints/cli/types.py +29 -0
  201. vllm/entrypoints/constants.py +10 -0
  202. vllm/entrypoints/context.py +444 -0
  203. vllm/entrypoints/harmony_utils.py +431 -0
  204. vllm/entrypoints/launcher.py +168 -0
  205. vllm/entrypoints/llm.py +1579 -0
  206. vllm/entrypoints/logger.py +79 -0
  207. vllm/entrypoints/openai/__init__.py +0 -0
  208. vllm/entrypoints/openai/api_server.py +2011 -0
  209. vllm/entrypoints/openai/cli_args.py +281 -0
  210. vllm/entrypoints/openai/logits_processors.py +90 -0
  211. vllm/entrypoints/openai/protocol.py +2590 -0
  212. vllm/entrypoints/openai/run_batch.py +497 -0
  213. vllm/entrypoints/openai/serving_chat.py +1591 -0
  214. vllm/entrypoints/openai/serving_classification.py +176 -0
  215. vllm/entrypoints/openai/serving_completion.py +688 -0
  216. vllm/entrypoints/openai/serving_embedding.py +632 -0
  217. vllm/entrypoints/openai/serving_engine.py +996 -0
  218. vllm/entrypoints/openai/serving_models.py +288 -0
  219. vllm/entrypoints/openai/serving_pooling.py +277 -0
  220. vllm/entrypoints/openai/serving_responses.py +1690 -0
  221. vllm/entrypoints/openai/serving_score.py +479 -0
  222. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  223. vllm/entrypoints/openai/serving_transcription.py +136 -0
  224. vllm/entrypoints/openai/speech_to_text.py +388 -0
  225. vllm/entrypoints/openai/tool_parsers/__init__.py +51 -0
  226. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  227. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  228. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  229. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  230. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  231. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  232. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +418 -0
  233. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  234. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  235. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  236. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  237. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  238. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  239. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  240. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  241. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +73 -0
  242. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  243. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  244. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  245. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  246. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  247. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  248. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  249. vllm/entrypoints/renderer.py +395 -0
  250. vllm/entrypoints/score_utils.py +232 -0
  251. vllm/entrypoints/ssl.py +75 -0
  252. vllm/entrypoints/tool.py +139 -0
  253. vllm/entrypoints/tool_server.py +195 -0
  254. vllm/entrypoints/utils.py +328 -0
  255. vllm/env_override.py +23 -0
  256. vllm/envs.py +1354 -0
  257. vllm/executor/__init__.py +0 -0
  258. vllm/executor/executor_base.py +378 -0
  259. vllm/executor/mp_distributed_executor.py +244 -0
  260. vllm/executor/msgspec_utils.py +35 -0
  261. vllm/executor/multiproc_worker_utils.py +279 -0
  262. vllm/executor/ray_distributed_executor.py +699 -0
  263. vllm/executor/ray_utils.py +410 -0
  264. vllm/executor/uniproc_executor.py +152 -0
  265. vllm/forward_context.py +273 -0
  266. vllm/inputs/__init__.py +44 -0
  267. vllm/inputs/data.py +356 -0
  268. vllm/inputs/parse.py +151 -0
  269. vllm/inputs/preprocess.py +973 -0
  270. vllm/inputs/registry.py +251 -0
  271. vllm/logger.py +229 -0
  272. vllm/logging_utils/__init__.py +8 -0
  273. vllm/logging_utils/dump_input.py +81 -0
  274. vllm/logging_utils/formatter.py +79 -0
  275. vllm/logits_process.py +119 -0
  276. vllm/logprobs.py +28 -0
  277. vllm/lora/__init__.py +0 -0
  278. vllm/lora/layers/__init__.py +34 -0
  279. vllm/lora/layers/base.py +69 -0
  280. vllm/lora/layers/base_linear.py +184 -0
  281. vllm/lora/layers/column_parallel_linear.py +622 -0
  282. vllm/lora/layers/logits_processor.py +247 -0
  283. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  284. vllm/lora/layers/replicated_linear.py +61 -0
  285. vllm/lora/layers/row_parallel_linear.py +201 -0
  286. vllm/lora/layers/utils.py +60 -0
  287. vllm/lora/layers/vocal_parallel_embedding.py +172 -0
  288. vllm/lora/lora.py +199 -0
  289. vllm/lora/models.py +792 -0
  290. vllm/lora/ops/__init__.py +0 -0
  291. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  292. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  293. vllm/lora/ops/torch_ops/__init__.py +16 -0
  294. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  295. vllm/lora/ops/triton_ops/__init__.py +12 -0
  296. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  297. vllm/lora/ops/triton_ops/lora_expand_op.py +291 -0
  298. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  299. vllm/lora/ops/triton_ops/lora_shrink_op.py +245 -0
  300. vllm/lora/ops/triton_ops/utils.py +126 -0
  301. vllm/lora/ops/xla_ops/__init__.py +7 -0
  302. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  303. vllm/lora/peft_helper.py +127 -0
  304. vllm/lora/punica_wrapper/__init__.py +10 -0
  305. vllm/lora/punica_wrapper/punica_base.py +458 -0
  306. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  307. vllm/lora/punica_wrapper/punica_gpu.py +279 -0
  308. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  309. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  310. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  311. vllm/lora/punica_wrapper/utils.py +136 -0
  312. vllm/lora/request.py +99 -0
  313. vllm/lora/resolver.py +85 -0
  314. vllm/lora/utils.py +246 -0
  315. vllm/lora/worker_manager.py +256 -0
  316. vllm/model_executor/__init__.py +16 -0
  317. vllm/model_executor/custom_op.py +194 -0
  318. vllm/model_executor/layers/__init__.py +0 -0
  319. vllm/model_executor/layers/activation.py +575 -0
  320. vllm/model_executor/layers/attention_layer_base.py +23 -0
  321. vllm/model_executor/layers/fla/__init__.py +8 -0
  322. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  323. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  324. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  325. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  326. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  327. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  328. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  329. vllm/model_executor/layers/fla/ops/index.py +39 -0
  330. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  331. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  332. vllm/model_executor/layers/fla/ops/op.py +39 -0
  333. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  334. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  335. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  336. vllm/model_executor/layers/fused_moe/__init__.py +80 -0
  337. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +304 -0
  338. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +164 -0
  339. vllm/model_executor/layers/fused_moe/config.py +497 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  560. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +297 -0
  561. vllm/model_executor/layers/fused_moe/cutlass_moe.py +996 -0
  562. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +370 -0
  563. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  564. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +280 -0
  565. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +229 -0
  566. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +243 -0
  567. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +97 -0
  568. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1042 -0
  569. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +240 -0
  570. vllm/model_executor/layers/fused_moe/fused_moe.py +2081 -0
  571. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +247 -0
  572. vllm/model_executor/layers/fused_moe/layer.py +1951 -0
  573. vllm/model_executor/layers/fused_moe/modular_kernel.py +892 -0
  574. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  575. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  576. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  577. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  578. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +321 -0
  579. vllm/model_executor/layers/fused_moe/prepare_finalize.py +72 -0
  580. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +431 -0
  581. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  582. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  583. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +171 -0
  584. vllm/model_executor/layers/fused_moe/trtllm_moe.py +197 -0
  585. vllm/model_executor/layers/fused_moe/utils.py +270 -0
  586. vllm/model_executor/layers/layernorm.py +381 -0
  587. vllm/model_executor/layers/lightning_attn.py +661 -0
  588. vllm/model_executor/layers/linear.py +1567 -0
  589. vllm/model_executor/layers/logits_processor.py +199 -0
  590. vllm/model_executor/layers/mamba/__init__.py +0 -0
  591. vllm/model_executor/layers/mamba/abstract.py +45 -0
  592. vllm/model_executor/layers/mamba/linear_attn.py +432 -0
  593. vllm/model_executor/layers/mamba/mamba2_metadata.py +186 -0
  594. vllm/model_executor/layers/mamba/mamba_mixer.py +517 -0
  595. vllm/model_executor/layers/mamba/mamba_mixer2.py +803 -0
  596. vllm/model_executor/layers/mamba/mamba_utils.py +202 -0
  597. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  598. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +982 -0
  599. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  600. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  601. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  602. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +574 -0
  603. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  604. vllm/model_executor/layers/mamba/ops/ssd_combined.py +248 -0
  605. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +248 -0
  606. vllm/model_executor/layers/mamba/short_conv.py +270 -0
  607. vllm/model_executor/layers/mla.py +158 -0
  608. vllm/model_executor/layers/pooler.py +732 -0
  609. vllm/model_executor/layers/quantization/__init__.py +157 -0
  610. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  611. vllm/model_executor/layers/quantization/awq.py +228 -0
  612. vllm/model_executor/layers/quantization/awq_marlin.py +548 -0
  613. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  614. vllm/model_executor/layers/quantization/base_config.py +164 -0
  615. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  616. vllm/model_executor/layers/quantization/bitsandbytes.py +621 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +795 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1651 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  625. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +161 -0
  626. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  627. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  628. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  629. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +156 -0
  630. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  631. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  632. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +227 -0
  633. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +135 -0
  634. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +21 -0
  635. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  636. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  637. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  638. vllm/model_executor/layers/quantization/deepgemm.py +81 -0
  639. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  640. vllm/model_executor/layers/quantization/experts_int8.py +215 -0
  641. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  642. vllm/model_executor/layers/quantization/fp8.py +1179 -0
  643. vllm/model_executor/layers/quantization/gguf.py +597 -0
  644. vllm/model_executor/layers/quantization/gptq.py +300 -0
  645. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  646. vllm/model_executor/layers/quantization/gptq_marlin.py +700 -0
  647. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  648. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  649. vllm/model_executor/layers/quantization/inc.py +61 -0
  650. vllm/model_executor/layers/quantization/input_quant_fp8.py +103 -0
  651. vllm/model_executor/layers/quantization/ipex_quant.py +410 -0
  652. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  653. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  654. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  655. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  656. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  657. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  658. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  659. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  660. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  661. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  662. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  663. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  664. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  665. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +163 -0
  666. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  667. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  668. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  669. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  670. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  671. vllm/model_executor/layers/quantization/modelopt.py +1548 -0
  672. vllm/model_executor/layers/quantization/moe_wna16.py +473 -0
  673. vllm/model_executor/layers/quantization/mxfp4.py +951 -0
  674. vllm/model_executor/layers/quantization/petit.py +306 -0
  675. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  676. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  677. vllm/model_executor/layers/quantization/quark/quark.py +431 -0
  678. vllm/model_executor/layers/quantization/quark/quark_moe.py +434 -0
  679. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  680. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  681. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +112 -0
  682. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  683. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  684. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  685. vllm/model_executor/layers/quantization/rtn.py +456 -0
  686. vllm/model_executor/layers/quantization/schema.py +86 -0
  687. vllm/model_executor/layers/quantization/torchao.py +214 -0
  688. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  689. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  690. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  691. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  903. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +85 -0
  904. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +258 -0
  905. vllm/model_executor/layers/quantization/utils/fp8_utils.py +795 -0
  906. vllm/model_executor/layers/quantization/utils/gptq_utils.py +96 -0
  907. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  908. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  909. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  910. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  911. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  912. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  913. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  914. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  915. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +132 -0
  916. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  917. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  918. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  919. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  920. vllm/model_executor/layers/quantization/utils/quant_utils.py +627 -0
  921. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  922. vllm/model_executor/layers/resampler.py +270 -0
  923. vllm/model_executor/layers/rotary_embedding/__init__.py +190 -0
  924. vllm/model_executor/layers/rotary_embedding/base.py +156 -0
  925. vllm/model_executor/layers/rotary_embedding/common.py +105 -0
  926. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +140 -0
  927. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  928. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  929. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  930. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  931. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  932. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  933. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  934. vllm/model_executor/layers/rotary_embedding/mrope.py +1140 -0
  935. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  936. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  937. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  938. vllm/model_executor/layers/sampler.py +1198 -0
  939. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  940. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  941. vllm/model_executor/layers/utils.py +196 -0
  942. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  943. vllm/model_executor/model_loader/__init__.py +138 -0
  944. vllm/model_executor/model_loader/base_loader.py +52 -0
  945. vllm/model_executor/model_loader/bitsandbytes_loader.py +787 -0
  946. vllm/model_executor/model_loader/default_loader.py +278 -0
  947. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  948. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  949. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  950. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  951. vllm/model_executor/model_loader/tensorizer.py +743 -0
  952. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  953. vllm/model_executor/model_loader/tpu.py +114 -0
  954. vllm/model_executor/model_loader/utils.py +271 -0
  955. vllm/model_executor/model_loader/weight_utils.py +946 -0
  956. vllm/model_executor/models/__init__.py +30 -0
  957. vllm/model_executor/models/adapters.py +542 -0
  958. vllm/model_executor/models/aimv2.py +246 -0
  959. vllm/model_executor/models/apertus.py +582 -0
  960. vllm/model_executor/models/arcee.py +423 -0
  961. vllm/model_executor/models/arctic.py +560 -0
  962. vllm/model_executor/models/aria.py +662 -0
  963. vllm/model_executor/models/aya_vision.py +470 -0
  964. vllm/model_executor/models/baichuan.py +475 -0
  965. vllm/model_executor/models/bailing_moe.py +529 -0
  966. vllm/model_executor/models/bamba.py +582 -0
  967. vllm/model_executor/models/bart.py +1343 -0
  968. vllm/model_executor/models/bert.py +613 -0
  969. vllm/model_executor/models/bert_with_rope.py +687 -0
  970. vllm/model_executor/models/blip.py +339 -0
  971. vllm/model_executor/models/blip2.py +716 -0
  972. vllm/model_executor/models/bloom.py +374 -0
  973. vllm/model_executor/models/chameleon.py +1141 -0
  974. vllm/model_executor/models/chatglm.py +479 -0
  975. vllm/model_executor/models/clip.py +407 -0
  976. vllm/model_executor/models/cohere2_vision.py +484 -0
  977. vllm/model_executor/models/commandr.py +467 -0
  978. vllm/model_executor/models/config.py +434 -0
  979. vllm/model_executor/models/constant_size_cache.py +137 -0
  980. vllm/model_executor/models/dbrx.py +473 -0
  981. vllm/model_executor/models/deepseek.py +491 -0
  982. vllm/model_executor/models/deepseek_eagle.py +241 -0
  983. vllm/model_executor/models/deepseek_mtp.py +282 -0
  984. vllm/model_executor/models/deepseek_v2.py +1058 -0
  985. vllm/model_executor/models/deepseek_vl2.py +661 -0
  986. vllm/model_executor/models/donut.py +387 -0
  987. vllm/model_executor/models/dots1.py +547 -0
  988. vllm/model_executor/models/ernie45.py +43 -0
  989. vllm/model_executor/models/ernie45_moe.py +608 -0
  990. vllm/model_executor/models/ernie45_vl.py +1510 -0
  991. vllm/model_executor/models/ernie45_vl_moe.py +728 -0
  992. vllm/model_executor/models/ernie_mtp.py +287 -0
  993. vllm/model_executor/models/exaone.py +552 -0
  994. vllm/model_executor/models/exaone4.py +535 -0
  995. vllm/model_executor/models/fairseq2_llama.py +154 -0
  996. vllm/model_executor/models/falcon.py +511 -0
  997. vllm/model_executor/models/falcon_h1.py +739 -0
  998. vllm/model_executor/models/florence2.py +1107 -0
  999. vllm/model_executor/models/fuyu.py +401 -0
  1000. vllm/model_executor/models/gemma.py +428 -0
  1001. vllm/model_executor/models/gemma2.py +425 -0
  1002. vllm/model_executor/models/gemma3.py +542 -0
  1003. vllm/model_executor/models/gemma3_mm.py +723 -0
  1004. vllm/model_executor/models/gemma3n.py +830 -0
  1005. vllm/model_executor/models/gemma3n_mm.py +767 -0
  1006. vllm/model_executor/models/glm.py +23 -0
  1007. vllm/model_executor/models/glm4.py +305 -0
  1008. vllm/model_executor/models/glm4_1v.py +1669 -0
  1009. vllm/model_executor/models/glm4_moe.py +703 -0
  1010. vllm/model_executor/models/glm4_moe_mtp.py +306 -0
  1011. vllm/model_executor/models/glm4v.py +654 -0
  1012. vllm/model_executor/models/gpt2.py +383 -0
  1013. vllm/model_executor/models/gpt_bigcode.py +346 -0
  1014. vllm/model_executor/models/gpt_j.py +340 -0
  1015. vllm/model_executor/models/gpt_neox.py +333 -0
  1016. vllm/model_executor/models/gpt_oss.py +687 -0
  1017. vllm/model_executor/models/granite.py +498 -0
  1018. vllm/model_executor/models/granite_speech.py +799 -0
  1019. vllm/model_executor/models/granitemoe.py +541 -0
  1020. vllm/model_executor/models/granitemoehybrid.py +684 -0
  1021. vllm/model_executor/models/granitemoeshared.py +342 -0
  1022. vllm/model_executor/models/gritlm.py +262 -0
  1023. vllm/model_executor/models/grok1.py +550 -0
  1024. vllm/model_executor/models/h2ovl.py +536 -0
  1025. vllm/model_executor/models/hunyuan_v1.py +937 -0
  1026. vllm/model_executor/models/hyperclovax_vision.py +1206 -0
  1027. vllm/model_executor/models/idefics2_vision_model.py +416 -0
  1028. vllm/model_executor/models/idefics3.py +758 -0
  1029. vllm/model_executor/models/interfaces.py +854 -0
  1030. vllm/model_executor/models/interfaces_base.py +195 -0
  1031. vllm/model_executor/models/intern_vit.py +481 -0
  1032. vllm/model_executor/models/internlm2.py +453 -0
  1033. vllm/model_executor/models/internlm2_ve.py +148 -0
  1034. vllm/model_executor/models/interns1.py +832 -0
  1035. vllm/model_executor/models/interns1_vit.py +418 -0
  1036. vllm/model_executor/models/internvl.py +1423 -0
  1037. vllm/model_executor/models/jais.py +374 -0
  1038. vllm/model_executor/models/jamba.py +630 -0
  1039. vllm/model_executor/models/jina_vl.py +144 -0
  1040. vllm/model_executor/models/keye.py +1684 -0
  1041. vllm/model_executor/models/keye_vl1_5.py +601 -0
  1042. vllm/model_executor/models/kimi_vl.py +620 -0
  1043. vllm/model_executor/models/lfm2.py +558 -0
  1044. vllm/model_executor/models/llama.py +671 -0
  1045. vllm/model_executor/models/llama4.py +732 -0
  1046. vllm/model_executor/models/llama4_eagle.py +241 -0
  1047. vllm/model_executor/models/llama_eagle.py +171 -0
  1048. vllm/model_executor/models/llama_eagle3.py +292 -0
  1049. vllm/model_executor/models/llava.py +872 -0
  1050. vllm/model_executor/models/llava_next.py +572 -0
  1051. vllm/model_executor/models/llava_next_video.py +479 -0
  1052. vllm/model_executor/models/llava_onevision.py +945 -0
  1053. vllm/model_executor/models/mamba.py +310 -0
  1054. vllm/model_executor/models/mamba2.py +346 -0
  1055. vllm/model_executor/models/mamba_cache.py +83 -0
  1056. vllm/model_executor/models/medusa.py +219 -0
  1057. vllm/model_executor/models/midashenglm.py +788 -0
  1058. vllm/model_executor/models/mimo.py +191 -0
  1059. vllm/model_executor/models/mimo_mtp.py +273 -0
  1060. vllm/model_executor/models/minicpm.py +593 -0
  1061. vllm/model_executor/models/minicpm3.py +230 -0
  1062. vllm/model_executor/models/minicpm_eagle.py +391 -0
  1063. vllm/model_executor/models/minicpmo.py +804 -0
  1064. vllm/model_executor/models/minicpmv.py +1786 -0
  1065. vllm/model_executor/models/minimax_cache.py +36 -0
  1066. vllm/model_executor/models/minimax_text_01.py +1027 -0
  1067. vllm/model_executor/models/minimax_vl_01.py +431 -0
  1068. vllm/model_executor/models/mistral3.py +628 -0
  1069. vllm/model_executor/models/mixtral.py +494 -0
  1070. vllm/model_executor/models/mllama.py +1697 -0
  1071. vllm/model_executor/models/mllama4.py +1079 -0
  1072. vllm/model_executor/models/mlp_speculator.py +206 -0
  1073. vllm/model_executor/models/modernbert.py +374 -0
  1074. vllm/model_executor/models/module_mapping.py +72 -0
  1075. vllm/model_executor/models/molmo.py +1569 -0
  1076. vllm/model_executor/models/moonvit.py +663 -0
  1077. vllm/model_executor/models/motif.py +345 -0
  1078. vllm/model_executor/models/mpt.py +332 -0
  1079. vllm/model_executor/models/nano_nemotron_vl.py +1395 -0
  1080. vllm/model_executor/models/nemotron.py +509 -0
  1081. vllm/model_executor/models/nemotron_h.py +633 -0
  1082. vllm/model_executor/models/nemotron_nas.py +484 -0
  1083. vllm/model_executor/models/nemotron_vl.py +655 -0
  1084. vllm/model_executor/models/nvlm_d.py +203 -0
  1085. vllm/model_executor/models/olmo.py +406 -0
  1086. vllm/model_executor/models/olmo2.py +428 -0
  1087. vllm/model_executor/models/olmoe.py +485 -0
  1088. vllm/model_executor/models/opt.py +413 -0
  1089. vllm/model_executor/models/orion.py +350 -0
  1090. vllm/model_executor/models/ovis.py +572 -0
  1091. vllm/model_executor/models/ovis2_5.py +644 -0
  1092. vllm/model_executor/models/paligemma.py +414 -0
  1093. vllm/model_executor/models/persimmon.py +345 -0
  1094. vllm/model_executor/models/phi.py +357 -0
  1095. vllm/model_executor/models/phi3.py +19 -0
  1096. vllm/model_executor/models/phi3v.py +701 -0
  1097. vllm/model_executor/models/phi4_multimodal.py +1478 -0
  1098. vllm/model_executor/models/phi4flash.py +737 -0
  1099. vllm/model_executor/models/phi4mm.py +1281 -0
  1100. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1101. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1102. vllm/model_executor/models/phimoe.py +681 -0
  1103. vllm/model_executor/models/pixtral.py +1348 -0
  1104. vllm/model_executor/models/plamo2.py +1126 -0
  1105. vllm/model_executor/models/qwen.py +363 -0
  1106. vllm/model_executor/models/qwen2.py +526 -0
  1107. vllm/model_executor/models/qwen2_5_omni_thinker.py +985 -0
  1108. vllm/model_executor/models/qwen2_5_vl.py +1256 -0
  1109. vllm/model_executor/models/qwen2_audio.py +492 -0
  1110. vllm/model_executor/models/qwen2_moe.py +558 -0
  1111. vllm/model_executor/models/qwen2_rm.py +122 -0
  1112. vllm/model_executor/models/qwen2_vl.py +1512 -0
  1113. vllm/model_executor/models/qwen3.py +344 -0
  1114. vllm/model_executor/models/qwen3_moe.py +704 -0
  1115. vllm/model_executor/models/qwen3_next.py +1298 -0
  1116. vllm/model_executor/models/qwen3_next_mtp.py +285 -0
  1117. vllm/model_executor/models/qwen_vl.py +795 -0
  1118. vllm/model_executor/models/registry.py +891 -0
  1119. vllm/model_executor/models/roberta.py +252 -0
  1120. vllm/model_executor/models/rvl.py +103 -0
  1121. vllm/model_executor/models/seed_oss.py +488 -0
  1122. vllm/model_executor/models/siglip.py +524 -0
  1123. vllm/model_executor/models/siglip2navit.py +688 -0
  1124. vllm/model_executor/models/skyworkr1v.py +914 -0
  1125. vllm/model_executor/models/smolvlm.py +44 -0
  1126. vllm/model_executor/models/solar.py +506 -0
  1127. vllm/model_executor/models/stablelm.py +344 -0
  1128. vllm/model_executor/models/starcoder2.py +357 -0
  1129. vllm/model_executor/models/step3_text.py +521 -0
  1130. vllm/model_executor/models/step3_vl.py +1091 -0
  1131. vllm/model_executor/models/swin.py +475 -0
  1132. vllm/model_executor/models/tarsier.py +649 -0
  1133. vllm/model_executor/models/telechat2.py +151 -0
  1134. vllm/model_executor/models/teleflm.py +79 -0
  1135. vllm/model_executor/models/terratorch.py +294 -0
  1136. vllm/model_executor/models/transformers.py +883 -0
  1137. vllm/model_executor/models/ultravox.py +667 -0
  1138. vllm/model_executor/models/utils.py +770 -0
  1139. vllm/model_executor/models/vision.py +125 -0
  1140. vllm/model_executor/models/voxtral.py +789 -0
  1141. vllm/model_executor/models/whisper.py +966 -0
  1142. vllm/model_executor/models/zamba2.py +1056 -0
  1143. vllm/model_executor/parameter.py +599 -0
  1144. vllm/model_executor/sampling_metadata.py +597 -0
  1145. vllm/model_executor/utils.py +97 -0
  1146. vllm/model_executor/warmup/__init__.py +0 -0
  1147. vllm/model_executor/warmup/deep_gemm_warmup.py +223 -0
  1148. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1149. vllm/multimodal/__init__.py +35 -0
  1150. vllm/multimodal/audio.py +116 -0
  1151. vllm/multimodal/base.py +219 -0
  1152. vllm/multimodal/cache.py +507 -0
  1153. vllm/multimodal/hasher.py +110 -0
  1154. vllm/multimodal/image.py +130 -0
  1155. vllm/multimodal/inputs.py +979 -0
  1156. vllm/multimodal/parse.py +496 -0
  1157. vllm/multimodal/processing.py +1921 -0
  1158. vllm/multimodal/profiling.py +313 -0
  1159. vllm/multimodal/registry.py +375 -0
  1160. vllm/multimodal/utils.py +754 -0
  1161. vllm/multimodal/video.py +312 -0
  1162. vllm/outputs.py +517 -0
  1163. vllm/platforms/__init__.py +263 -0
  1164. vllm/platforms/cpu.py +353 -0
  1165. vllm/platforms/cuda.py +731 -0
  1166. vllm/platforms/interface.py +599 -0
  1167. vllm/platforms/rocm.py +504 -0
  1168. vllm/platforms/tpu.py +236 -0
  1169. vllm/platforms/xpu.py +243 -0
  1170. vllm/plugins/__init__.py +72 -0
  1171. vllm/plugins/io_processors/__init__.py +68 -0
  1172. vllm/plugins/io_processors/interface.py +67 -0
  1173. vllm/plugins/lora_resolvers/README.md +16 -0
  1174. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1175. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1176. vllm/pooling_params.py +183 -0
  1177. vllm/profiler/__init__.py +0 -0
  1178. vllm/profiler/layerwise_profile.py +375 -0
  1179. vllm/profiler/utils.py +148 -0
  1180. vllm/py.typed +2 -0
  1181. vllm/ray/__init__.py +0 -0
  1182. vllm/ray/lazy_utils.py +22 -0
  1183. vllm/ray/ray_env.py +72 -0
  1184. vllm/reasoning/__init__.py +25 -0
  1185. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1186. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1187. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1188. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1189. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1190. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1191. vllm/reasoning/mistral_reasoning_parser.py +47 -0
  1192. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1193. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1194. vllm/sampling_params.py +577 -0
  1195. vllm/scalar_type.py +349 -0
  1196. vllm/scripts.py +15 -0
  1197. vllm/sequence.py +1465 -0
  1198. vllm/tasks.py +11 -0
  1199. vllm/test_utils.py +130 -0
  1200. vllm/third_party/__init__.py +0 -0
  1201. vllm/third_party/pynvml.py +6140 -0
  1202. vllm/tracing.py +136 -0
  1203. vllm/transformers_utils/__init__.py +24 -0
  1204. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1205. vllm/transformers_utils/chat_templates/registry.py +71 -0
  1206. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1207. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1208. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1209. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1210. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1211. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1212. vllm/transformers_utils/config.py +1043 -0
  1213. vllm/transformers_utils/config_parser_base.py +20 -0
  1214. vllm/transformers_utils/configs/__init__.py +55 -0
  1215. vllm/transformers_utils/configs/arctic.py +207 -0
  1216. vllm/transformers_utils/configs/chatglm.py +72 -0
  1217. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1218. vllm/transformers_utils/configs/eagle.py +84 -0
  1219. vllm/transformers_utils/configs/falcon.py +90 -0
  1220. vllm/transformers_utils/configs/jais.py +238 -0
  1221. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1222. vllm/transformers_utils/configs/medusa.py +63 -0
  1223. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1224. vllm/transformers_utils/configs/mistral.py +165 -0
  1225. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1226. vllm/transformers_utils/configs/moonvit.py +33 -0
  1227. vllm/transformers_utils/configs/nemotron.py +205 -0
  1228. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1229. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1230. vllm/transformers_utils/configs/ovis.py +176 -0
  1231. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1232. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1233. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1234. vllm/transformers_utils/configs/speculators/base.py +91 -0
  1235. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1236. vllm/transformers_utils/configs/ultravox.py +120 -0
  1237. vllm/transformers_utils/detokenizer.py +169 -0
  1238. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1239. vllm/transformers_utils/dynamic_module.py +60 -0
  1240. vllm/transformers_utils/processor.py +245 -0
  1241. vllm/transformers_utils/processors/__init__.py +16 -0
  1242. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1243. vllm/transformers_utils/processors/ovis.py +420 -0
  1244. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1245. vllm/transformers_utils/runai_utils.py +99 -0
  1246. vllm/transformers_utils/s3_utils.py +90 -0
  1247. vllm/transformers_utils/tokenizer.py +293 -0
  1248. vllm/transformers_utils/tokenizer_base.py +149 -0
  1249. vllm/transformers_utils/tokenizer_group.py +132 -0
  1250. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1251. vllm/transformers_utils/tokenizers/mistral.py +520 -0
  1252. vllm/transformers_utils/utils.py +99 -0
  1253. vllm/triton_utils/__init__.py +16 -0
  1254. vllm/triton_utils/importing.py +95 -0
  1255. vllm/usage/__init__.py +0 -0
  1256. vllm/usage/usage_lib.py +259 -0
  1257. vllm/utils/__init__.py +3438 -0
  1258. vllm/utils/deep_gemm.py +212 -0
  1259. vllm/utils/flashinfer.py +372 -0
  1260. vllm/utils/jsontree.py +90 -0
  1261. vllm/utils/tensor_schema.py +236 -0
  1262. vllm/v1/__init__.py +0 -0
  1263. vllm/v1/attention/__init__.py +0 -0
  1264. vllm/v1/attention/backends/__init__.py +0 -0
  1265. vllm/v1/attention/backends/cpu_attn.py +922 -0
  1266. vllm/v1/attention/backends/flash_attn.py +800 -0
  1267. vllm/v1/attention/backends/flashinfer.py +1128 -0
  1268. vllm/v1/attention/backends/flex_attention.py +796 -0
  1269. vllm/v1/attention/backends/gdn_attn.py +320 -0
  1270. vllm/v1/attention/backends/linear_attn.py +68 -0
  1271. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1272. vllm/v1/attention/backends/mamba2_attn.py +224 -0
  1273. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1274. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1275. vllm/v1/attention/backends/mla/common.py +1608 -0
  1276. vllm/v1/attention/backends/mla/cutlass_mla.py +301 -0
  1277. vllm/v1/attention/backends/mla/flashattn_mla.py +273 -0
  1278. vllm/v1/attention/backends/mla/flashinfer_mla.py +110 -0
  1279. vllm/v1/attention/backends/mla/flashmla.py +213 -0
  1280. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1281. vllm/v1/attention/backends/mla/triton_mla.py +175 -0
  1282. vllm/v1/attention/backends/pallas.py +413 -0
  1283. vllm/v1/attention/backends/rocm_aiter_fa.py +548 -0
  1284. vllm/v1/attention/backends/short_conv_attn.py +82 -0
  1285. vllm/v1/attention/backends/tree_attn.py +450 -0
  1286. vllm/v1/attention/backends/triton_attn.py +430 -0
  1287. vllm/v1/attention/backends/utils.py +834 -0
  1288. vllm/v1/attention/backends/xformers.py +437 -0
  1289. vllm/v1/core/__init__.py +0 -0
  1290. vllm/v1/core/block_pool.py +330 -0
  1291. vllm/v1/core/encoder_cache_manager.py +333 -0
  1292. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1293. vllm/v1/core/kv_cache_manager.py +398 -0
  1294. vllm/v1/core/kv_cache_utils.py +1169 -0
  1295. vllm/v1/core/sched/__init__.py +0 -0
  1296. vllm/v1/core/sched/async_scheduler.py +47 -0
  1297. vllm/v1/core/sched/interface.py +158 -0
  1298. vllm/v1/core/sched/output.py +162 -0
  1299. vllm/v1/core/sched/request_queue.py +224 -0
  1300. vllm/v1/core/sched/scheduler.py +1287 -0
  1301. vllm/v1/core/sched/utils.py +69 -0
  1302. vllm/v1/core/single_type_kv_cache_manager.py +670 -0
  1303. vllm/v1/cudagraph_dispatcher.py +121 -0
  1304. vllm/v1/engine/__init__.py +202 -0
  1305. vllm/v1/engine/async_llm.py +757 -0
  1306. vllm/v1/engine/coordinator.py +357 -0
  1307. vllm/v1/engine/core.py +1245 -0
  1308. vllm/v1/engine/core_client.py +1333 -0
  1309. vllm/v1/engine/detokenizer.py +300 -0
  1310. vllm/v1/engine/exceptions.py +17 -0
  1311. vllm/v1/engine/llm_engine.py +332 -0
  1312. vllm/v1/engine/logprobs.py +201 -0
  1313. vllm/v1/engine/output_processor.py +558 -0
  1314. vllm/v1/engine/parallel_sampling.py +133 -0
  1315. vllm/v1/engine/processor.py +524 -0
  1316. vllm/v1/engine/utils.py +857 -0
  1317. vllm/v1/executor/__init__.py +0 -0
  1318. vllm/v1/executor/abstract.py +126 -0
  1319. vllm/v1/executor/multiproc_executor.py +683 -0
  1320. vllm/v1/executor/ray_distributed_executor.py +109 -0
  1321. vllm/v1/kv_cache_interface.py +275 -0
  1322. vllm/v1/metrics/__init__.py +0 -0
  1323. vllm/v1/metrics/loggers.py +717 -0
  1324. vllm/v1/metrics/prometheus.py +82 -0
  1325. vllm/v1/metrics/ray_wrappers.py +133 -0
  1326. vllm/v1/metrics/reader.py +246 -0
  1327. vllm/v1/metrics/stats.py +248 -0
  1328. vllm/v1/outputs.py +147 -0
  1329. vllm/v1/pool/__init__.py +0 -0
  1330. vllm/v1/pool/metadata.py +77 -0
  1331. vllm/v1/request.py +237 -0
  1332. vllm/v1/sample/__init__.py +0 -0
  1333. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1334. vllm/v1/sample/logits_processor/builtin.py +273 -0
  1335. vllm/v1/sample/logits_processor/interface.py +97 -0
  1336. vllm/v1/sample/logits_processor/state.py +161 -0
  1337. vllm/v1/sample/metadata.py +43 -0
  1338. vllm/v1/sample/ops/__init__.py +0 -0
  1339. vllm/v1/sample/ops/bad_words.py +39 -0
  1340. vllm/v1/sample/ops/logprobs.py +26 -0
  1341. vllm/v1/sample/ops/penalties.py +43 -0
  1342. vllm/v1/sample/ops/topk_topp_sampler.py +254 -0
  1343. vllm/v1/sample/rejection_sampler.py +623 -0
  1344. vllm/v1/sample/sampler.py +281 -0
  1345. vllm/v1/sample/tpu/__init__.py +0 -0
  1346. vllm/v1/sample/tpu/metadata.py +124 -0
  1347. vllm/v1/sample/tpu/sampler.py +213 -0
  1348. vllm/v1/serial_utils.py +395 -0
  1349. vllm/v1/spec_decode/__init__.py +0 -0
  1350. vllm/v1/spec_decode/eagle.py +740 -0
  1351. vllm/v1/spec_decode/medusa.py +66 -0
  1352. vllm/v1/spec_decode/metadata.py +62 -0
  1353. vllm/v1/spec_decode/metrics.py +191 -0
  1354. vllm/v1/spec_decode/ngram_proposer.py +157 -0
  1355. vllm/v1/spec_decode/utils.py +14 -0
  1356. vllm/v1/structured_output/__init__.py +297 -0
  1357. vllm/v1/structured_output/backend_guidance.py +245 -0
  1358. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1359. vllm/v1/structured_output/backend_outlines.py +320 -0
  1360. vllm/v1/structured_output/backend_types.py +134 -0
  1361. vllm/v1/structured_output/backend_xgrammar.py +323 -0
  1362. vllm/v1/structured_output/request.py +86 -0
  1363. vllm/v1/structured_output/utils.py +373 -0
  1364. vllm/v1/utils.py +382 -0
  1365. vllm/v1/worker/__init__.py +0 -0
  1366. vllm/v1/worker/block_table.py +221 -0
  1367. vllm/v1/worker/cpu_model_runner.py +163 -0
  1368. vllm/v1/worker/cpu_worker.py +183 -0
  1369. vllm/v1/worker/gpu_input_batch.py +821 -0
  1370. vllm/v1/worker/gpu_model_runner.py +3743 -0
  1371. vllm/v1/worker/gpu_worker.py +697 -0
  1372. vllm/v1/worker/kv_connector_model_runner_mixin.py +122 -0
  1373. vllm/v1/worker/lora_model_runner_mixin.py +192 -0
  1374. vllm/v1/worker/tpu_input_batch.py +585 -0
  1375. vllm/v1/worker/tpu_model_runner.py +1947 -0
  1376. vllm/v1/worker/tpu_worker.py +340 -0
  1377. vllm/v1/worker/utils.py +290 -0
  1378. vllm/v1/worker/worker_base.py +65 -0
  1379. vllm/v1/worker/xpu_model_runner.py +53 -0
  1380. vllm/v1/worker/xpu_worker.py +179 -0
  1381. vllm/version.py +41 -0
  1382. vllm/vllm_flash_attn/.gitkeep +0 -0
  1383. vllm/worker/__init__.py +0 -0
  1384. vllm/worker/cache_engine.py +145 -0
  1385. vllm/worker/enc_dec_model_runner.py +553 -0
  1386. vllm/worker/model_runner.py +2016 -0
  1387. vllm/worker/model_runner_base.py +307 -0
  1388. vllm/worker/utils.py +49 -0
  1389. vllm/worker/worker.py +670 -0
  1390. vllm/worker/worker_base.py +651 -0
  1391. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/METADATA +326 -0
  1392. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/RECORD +1395 -0
  1393. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/WHEEL +5 -0
  1394. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/entry_points.txt +5 -0
  1395. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/top_level.txt +1 -0
vllm/core/scheduler.py ADDED
@@ -0,0 +1,2028 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ import enum
5
+ import os
6
+ import random
7
+ import time
8
+ from collections import deque
9
+ from dataclasses import dataclass, field
10
+ from typing import Callable, Deque, Dict, Iterable, List, Optional
11
+ from typing import Sequence as GenericSequence
12
+ from typing import Set, Tuple, Union
13
+
14
+ from vllm.config import CacheConfig, SchedulerConfig
15
+ from vllm.config.lora import LoRAConfig
16
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
17
+ from vllm.logger import init_logger
18
+ from vllm.lora.request import LoRARequest
19
+ from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
20
+ SequenceGroupBase, SequenceGroupMetadata,
21
+ SequenceGroupMetadataDelta, SequenceStage,
22
+ SequenceStatus)
23
+ from vllm.utils import Device, PyObjectCache
24
+
25
+ logger = init_logger(__name__)
26
+
27
+ # Test-only. If configured, decodes are preempted with
28
+ # probability ARTIFICIAL_PREEMPTION_PROB.
29
+ ENABLE_ARTIFICIAL_PREEMPT = bool(
30
+ os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa
31
+ ARTIFICIAL_PREEMPTION_PROB = 0.5
32
+ ARTIFICIAL_PREEMPTION_MAX_CNT = 500
33
+
34
+
35
+ class PreemptionMode(enum.Enum):
36
+ """Preemption modes.
37
+
38
+ 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
39
+ and swap them back in when the sequences are resumed.
40
+ 2. Recomputation: Discard the blocks of the preempted sequences and
41
+ recompute them when the sequences are resumed, treating the sequences as
42
+ new prompts.
43
+ """
44
+
45
+ SWAP = enum.auto()
46
+ RECOMPUTE = enum.auto()
47
+
48
+
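The two modes above trade memory traffic against recomputation: SWAP preserves the KV blocks by moving them to CPU memory, while RECOMPUTE frees them and re-runs the prefill later. A minimal, hypothetical sketch of how a caller might choose between them (illustration only; the packaged Scheduler makes this decision in its own _preempt method, which may differ):

    from vllm.core.scheduler import PreemptionMode

    def choose_preemption_mode(num_running_seqs: int) -> PreemptionMode:
        # Hypothetical heuristic for illustration: recomputing a single
        # sequence is cheap because its prompt can simply be prefilled again,
        # while groups with several sequences keep their blocks via swapping.
        if num_running_seqs == 1:
            return PreemptionMode.RECOMPUTE
        return PreemptionMode.SWAP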
49
+ @dataclass
50
+ class SchedulingBudget:
51
+ """The available slots for scheduling.
52
+
53
+ TODO(sang): Right now, the budget is request_id-aware meaning it can ignore
54
+ budget updates from the same request_id. This is because in the normal scheduling
55
+ path, we update RUNNING num_seqs ahead of time, meaning it could be
56
+ updated more than once when scheduling RUNNING requests. Since this won't
57
+ happen if we only have chunked prefill scheduling, we can remove this
58
+ feature from the API when chunked prefill is enabled by default.
59
+ """
60
+
61
+ token_budget: int
62
+ max_num_seqs: int
63
+ _request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
64
+ _request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
65
+ # Number of cached tokens in the batch.
66
+ _num_cached_tokens: int = 0
67
+ # Number of actual non-cached tokens in the batch.
68
+ _num_batched_tokens: int = 0
69
+ _num_curr_seqs: int = 0
70
+
71
+ def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
72
+ # We allow num_new_tokens to be 0 when the entire sequence has
73
+ # been cached.
74
+ assert num_new_tokens >= 0
75
+ assert num_new_seqs != 0
76
+ return (self.num_batched_tokens + num_new_tokens <= self.token_budget
77
+ and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)
78
+
79
+ def remaining_token_budget(self):
80
+ return self.token_budget - self.num_batched_tokens
81
+
82
+ def add_num_batched_tokens(self,
83
+ req_id: str,
84
+ num_batched_tokens: int,
85
+ num_cached_tokens: int = 0):
86
+ if req_id in self._request_ids_num_batched_tokens:
87
+ return
88
+ assert num_cached_tokens >= 0
89
+ assert num_batched_tokens >= 0
90
+
91
+ self._request_ids_num_batched_tokens.add(req_id)
92
+ self._num_batched_tokens += num_batched_tokens
93
+ self._num_cached_tokens += num_cached_tokens
94
+
95
+ def subtract_num_batched_tokens(self, req_id: str,
96
+ num_batched_tokens: int):
97
+ if req_id in self._request_ids_num_batched_tokens:
98
+ self._request_ids_num_batched_tokens.remove(req_id)
99
+ self._num_batched_tokens -= num_batched_tokens
100
+
101
+ def add_num_seqs(self, req_id: str, num_curr_seqs: int):
102
+ if req_id in self._request_ids_num_curr_seqs:
103
+ return
104
+
105
+ self._request_ids_num_curr_seqs.add(req_id)
106
+ self._num_curr_seqs += num_curr_seqs
107
+
108
+ def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
109
+ if req_id in self._request_ids_num_curr_seqs:
110
+ self._request_ids_num_curr_seqs.remove(req_id)
111
+ self._num_curr_seqs -= num_curr_seqs
112
+
113
+ @property
114
+ def num_batched_tokens(self):
115
+ return self._num_batched_tokens
116
+
117
+ @property
118
+ def num_curr_seqs(self):
119
+ return self._num_curr_seqs
120
+
121
+ @property
122
+ def num_cached_tokens(self):
123
+ return self._num_cached_tokens
124
+
125
+
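Because add_num_batched_tokens and add_num_seqs de-duplicate by request_id, calling them twice for the same request is a no-op. A short usage sketch with hypothetical numbers:

    from vllm.core.scheduler import SchedulingBudget

    budget = SchedulingBudget(token_budget=2048, max_num_seqs=4)
    assert budget.can_schedule(num_new_tokens=512, num_new_seqs=1)
    budget.add_num_batched_tokens("req-0", 512)
    budget.add_num_batched_tokens("req-0", 512)  # ignored: same request_id
    budget.add_num_seqs("req-0", 1)
    assert budget.num_batched_tokens == 512
    assert budget.remaining_token_budget() == 1536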
126
+ @dataclass
127
+ class ScheduledSequenceGroup:
128
+ # A sequence group that's scheduled.
129
+ seq_group: SequenceGroup
130
+ # The total chunk size (number of tokens) to process for next iteration.
131
+ # 1 for decoding. Same as prompt tokens for prefill, but if prefill is
132
+ # chunked, it can be smaller than that.
133
+ token_chunk_size: int
134
+
135
+
136
+ @dataclass
137
+ class SchedulerOutputs:
138
+ """The scheduling decision made from a scheduler."""
139
+
140
+ # Scheduled sequence groups.
141
+ scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup]
142
+ # Number of prefill groups scheduled.
143
+ num_prefill_groups: int
144
+ # Total number of batched tokens.
145
+ num_batched_tokens: int
146
+ # Blocks to swap in. List of CPU -> GPU block number.
147
+ blocks_to_swap_in: List[Tuple[int, int]]
148
+ # Blocks to swap out. List of GPU -> CPU block number.
149
+ blocks_to_swap_out: List[Tuple[int, int]]
150
+ # Blocks to copy. Source to dest block.
151
+ blocks_to_copy: List[Tuple[int, int]]
152
+ # Sequence groups that are going to be ignored.
153
+ ignored_seq_groups: List[SequenceGroup]
154
+ # The number of slots for lookahead decoding.
155
+ num_lookahead_slots: int
156
+ # The number of requests in the running queue
157
+ running_queue_size: int
158
+ preempted: int
159
+
160
+ def __post_init__(self):
161
+ # Swap in and swap out should never happen at the same time.
162
+ assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)
163
+
164
+ self.num_loras: int = len(self.lora_requests)
165
+ if self.num_loras > 0:
166
+ self._sort_by_lora_ids()
167
+
168
+ def is_empty(self) -> bool:
169
+ # NOTE: We do not consider the ignored sequence groups.
170
+ return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
171
+ and not self.blocks_to_swap_out and not self.blocks_to_copy)
172
+
173
+ def _sort_by_lora_ids(self):
174
+ assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups)
175
+
176
+ def key_fn(group: ScheduledSequenceGroup):
177
+ key = (group.seq_group.lora_int_id, group.seq_group.request_id)
178
+ if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups):
179
+ # Sort sequence groups so that all prefills come before all
180
+ # decodes as required by chunked prefill.
181
+ return (not group.seq_group.is_prefill(), *key)
182
+ return key
183
+
184
+ self.scheduled_seq_groups = sorted(self.scheduled_seq_groups,
185
+ key=key_fn)
186
+
187
+ @property
188
+ def lora_requests(self) -> Set[LoRARequest]:
189
+ return {
190
+ g.seq_group.lora_request
191
+ for g in self.scheduled_seq_groups
192
+ if g.seq_group.lora_request is not None
193
+ }
194
+
195
+
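The key_fn above orders scheduled groups so that, when both prefills and decodes are present, prefills precede decodes and entries are then grouped by LoRA id and request id. The same ordering can be reproduced with plain tuples (illustrative values only, not part of the packaged module):

    # (is_prefill, lora_int_id, request_id) triples; `not is_prefill` maps
    # prefills to False (0) and decodes to True (1), so prefills sort first.
    groups = [(False, 2, "b"), (True, 1, "c"), (True, 2, "a")]
    ordered = sorted(groups, key=lambda g: (not g[0], g[1], g[2]))
    print(ordered)  # [(True, 1, 'c'), (True, 2, 'a'), (False, 2, 'b')]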
196
+ @dataclass
197
+ class SchedulerRunningOutputs:
198
+ """The requests that are scheduled from a running queue.
199
+
200
+ Could contain prefills (i.e. chunked prefills) or decodes. If there is not
201
+ enough memory, requests can be preempted (for recomputation) or swapped out.
202
+ """
203
+
204
+ # Selected sequences that are running and in a decoding phase.
205
+ decode_seq_groups: List[ScheduledSequenceGroup]
206
+ # Selected sequences that are running and in a prefill phase.
207
+ # I.e., it means the prefill has been chunked.
208
+ prefill_seq_groups: List[ScheduledSequenceGroup]
209
+ # The preempted sequences.
210
+ preempted: List[SequenceGroup]
211
+ # Sequences that are swapped out.
212
+ swapped_out: List[SequenceGroup]
213
+ # The blocks to swap out.
214
+ blocks_to_swap_out: List[Tuple[int, int]]
215
+ # The blocks to copy.
216
+ blocks_to_copy: List[Tuple[int, int]]
217
+ # The number of slots for lookahead decoding.
218
+ num_lookahead_slots: int
219
+
220
+ # Optimization for fast-access to seq_group lists
221
+ decode_seq_groups_list: List[SequenceGroup]
222
+ prefill_seq_groups_list: List[SequenceGroup]
223
+
224
+ @classmethod
225
+ def create_empty(cls) -> "SchedulerRunningOutputs":
226
+ return SchedulerRunningOutputs(
227
+ decode_seq_groups=[],
228
+ prefill_seq_groups=[],
229
+ preempted=[],
230
+ swapped_out=[],
231
+ blocks_to_swap_out=[],
232
+ blocks_to_copy=[],
233
+ num_lookahead_slots=0,
234
+ decode_seq_groups_list=[],
235
+ prefill_seq_groups_list=[],
236
+ )
237
+
238
+
239
+ @dataclass
240
+ class SchedulerSwappedInOutputs:
241
+ """The requests that are scheduled from a swap queue.
242
+
243
+ Could contain prefills (i.e. chunked prefills) or decodes.
244
+ """
245
+
246
+ # Selected sequences that are going to be swapped in and are in a
247
+ # decoding phase.
248
+ decode_seq_groups: List[ScheduledSequenceGroup]
249
+ # Selected sequences that are going to be swapped in and in a prefill
250
+ # phase. I.e., it means the prefill has been chunked.
251
+ prefill_seq_groups: List[ScheduledSequenceGroup]
252
+ # The blocks to swap in.
253
+ blocks_to_swap_in: List[Tuple[int, int]]
254
+ # The blocks to copy.
255
+ blocks_to_copy: List[Tuple[int, int]]
256
+ # The number of slots for lookahead decoding.
257
+ num_lookahead_slots: int
258
+ # Infeasible sequence groups.
259
+ infeasible_seq_groups: List[SequenceGroup]
260
+
261
+ @classmethod
262
+ def create_empty(cls) -> "SchedulerSwappedInOutputs":
263
+ return SchedulerSwappedInOutputs(
264
+ decode_seq_groups=[],
265
+ prefill_seq_groups=[],
266
+ blocks_to_swap_in=[],
267
+ blocks_to_copy=[],
268
+ num_lookahead_slots=0,
269
+ infeasible_seq_groups=[],
270
+ )
271
+
272
+
273
+ @dataclass
274
+ class SchedulerPrefillOutputs:
275
+ """The requests that are scheduled from a waiting queue.
276
+
277
+ Could contain fresh prefill requests or preempted requests that need
278
+ to be recomputed from scratch.
279
+ """
280
+
281
+ # Selected sequences for prefill.
282
+ seq_groups: List[ScheduledSequenceGroup]
283
+ # Ignored sequence groups.
284
+ ignored_seq_groups: List[SequenceGroup]
285
+ num_lookahead_slots: int
286
+
287
+ @classmethod
288
+ def create_empty(cls) -> "SchedulerPrefillOutputs":
289
+ return SchedulerPrefillOutputs(
290
+ seq_groups=[],
291
+ ignored_seq_groups=[],
292
+ num_lookahead_slots=0,
293
+ )
294
+
295
+
296
+ def seq_group_metadata_builder():
297
+ return SequenceGroupMetadata(request_id="",
298
+ is_prompt=False,
299
+ seq_data={},
300
+ sampling_params=None,
301
+ block_tables={})
302
+
303
+
304
+ def scheduler_running_outputs_builder():
305
+ return SchedulerRunningOutputs(decode_seq_groups=[],
306
+ prefill_seq_groups=[],
307
+ preempted=[],
308
+ swapped_out=[],
309
+ blocks_to_swap_out=[],
310
+ blocks_to_copy=[],
311
+ num_lookahead_slots=0,
312
+ prefill_seq_groups_list=[],
313
+ decode_seq_groups_list=[])
314
+
315
+
316
+ def scheduled_seq_group_builder():
317
+ return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
318
+ token_chunk_size=0)
319
+ # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
320
+
321
+
322
+ @dataclass
323
+ class PartialPrefillMetadata:
324
+ """Holds information about the partial prefills that are currently running
325
+ during a single iteration of the Scheduler.
326
+ When chunked prefill is enabled, we allow a certain number of seqs to be
327
+ partially prefilled during each iteration. Having multiple partial prefills
328
+ in flight allows us to minimize TTFT and avoid decode starvation in cases
329
+ where a single sequence group with a very large prompt blocks the queue for
330
+ too many iterations.
331
+ The number of long prefill requests is limited so that smaller
332
+ requests may jump the queue in front of them and get to the decode
333
+ phase faster.
334
+ """
335
+
336
+ # A minimum bound on the total number of prefills to be scheduled during
337
+ # this iteration
338
+ schedulable_prefills: int
339
+
340
+ # The number of long prefill requests currently running
341
+ long_prefills: int
342
+
343
+ scheduler_config: SchedulerConfig
344
+
345
+ def can_schedule(self, seq_group: SequenceGroup) -> bool:
346
+ """When concurrent partial prefills are enabled,
347
+ we limit the number of long requests and only accept
348
+ shorter requests from the queue while running them
349
+ concurrently"""
350
+ return not (seq_group.first_seq.get_num_new_tokens()
351
+ > self.scheduler_config.long_prefill_token_threshold
352
+ and self.long_prefills
353
+ >= self.scheduler_config.max_long_partial_prefills
354
+ and self.scheduler_config.max_num_partial_prefills > 1)
355
+
356
+ def maybe_increment_partial_prefills(self,
357
+ seq_group: SequenceGroup) -> None:
358
+ # When a new prefill is scheduled, we need to know if it is a
359
+ # long request
360
+ if (seq_group.first_seq.get_num_new_tokens()
361
+ > self.scheduler_config.long_prefill_token_threshold):
362
+ self.long_prefills += 1
363
+
364
+ @classmethod
365
+ def from_queues(
366
+ cls,
367
+ running: Deque[SequenceGroup],
368
+ waiting: Deque[SequenceGroup],
369
+ scheduler_config: SchedulerConfig,
370
+ ) -> "PartialPrefillMetadata":
371
+ """Create a PartialPrefillMetadata object from the current state of
372
+ the scheduler's queues.
373
+ This accounts for the currently running prefill requests, and peeks into
374
+ the waiting queue to see if there are more prefills to potentially be
375
+ scheduled during this iteration."""
376
+ prefills = 0
377
+ long_prefills = 0
378
+
379
+ waiting_long_prefills = 0
380
+
381
+ for sg in running:
382
+ if sg.first_seq.data.stage == SequenceStage.PREFILL:
383
+ prefills += 1
384
+ if (sg.first_seq.get_num_new_tokens()
385
+ > scheduler_config.long_prefill_token_threshold):
386
+ long_prefills += 1
387
+
388
+ for sg in waiting:
389
+ # Don't bother looping through the rest of the queue if we know
390
+ # there are already at least
391
+ # max_num_partial_prefills requests to fill
392
+ if prefills >= scheduler_config.max_num_partial_prefills:
393
+ break
394
+
395
+ # Don't count long requests from the waiting queue if we aren't
396
+ # going to schedule them anyway
397
+ if (sg.first_seq.get_num_new_tokens()
398
+ > scheduler_config.long_prefill_token_threshold):
399
+ if (long_prefills + waiting_long_prefills
400
+ >= scheduler_config.max_long_partial_prefills):
401
+ continue
402
+ waiting_long_prefills += 1
403
+ prefills += 1
404
+
405
+ # NB: long_prefills and waiting_long_prefills are tracked separately.
406
+ # We don't account for the waiting requests here because we need to use
407
+ # this metadata to track how many have actually been scheduled.
408
+ return PartialPrefillMetadata(
409
+ schedulable_prefills=min(
410
+ prefills, scheduler_config.max_num_partial_prefills),
411
+ long_prefills=long_prefills,
412
+ scheduler_config=scheduler_config,
413
+ )
414
+
415
+
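The gating condition in can_schedule above is easy to misread through the negation; rewritten as a standalone predicate (a paraphrase for illustration, not part of the packaged module):

    def admits_prefill(num_new_tokens: int, long_prefills_running: int,
                       long_token_threshold: int,
                       max_long_partial_prefills: int,
                       max_num_partial_prefills: int) -> bool:
        # A request is deferred only when all three hold: it is "long", the
        # long-prefill slots are already full, and concurrent partial
        # prefills are enabled at all (max_num_partial_prefills > 1).
        is_long = num_new_tokens > long_token_threshold
        slots_full = long_prefills_running >= max_long_partial_prefills
        concurrent = max_num_partial_prefills > 1
        return not (is_long and slots_full and concurrent)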
416
+ class Scheduler:
417
+
418
+ def __init__(
419
+ self,
420
+ scheduler_config: SchedulerConfig,
421
+ cache_config: CacheConfig,
422
+ lora_config: Optional[LoRAConfig],
423
+ pipeline_parallel_size: int = 1,
424
+ output_proc_callback: Optional[Callable] = None,
425
+ ) -> None:
426
+ self.scheduler_config = scheduler_config
427
+ self.cache_config = cache_config
428
+ # Note for LoRA scheduling: the current policy is extremely
429
+ # simple and NOT fair. It can lead to starvation of some
430
+ # LoRAs. This should be improved in the future.
431
+ self.lora_config = lora_config
432
+
433
+ version = "selfattn"
434
+ if (self.scheduler_config.runner_type == "pooling"
435
+ or self.cache_config.is_attention_free):
436
+ version = "placeholder"
437
+
438
+ BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
439
+ version)
440
+
441
+ num_gpu_blocks = cache_config.num_gpu_blocks
442
+ if num_gpu_blocks:
443
+ num_gpu_blocks //= pipeline_parallel_size
444
+
445
+ num_cpu_blocks = cache_config.num_cpu_blocks
446
+ if num_cpu_blocks:
447
+ num_cpu_blocks //= pipeline_parallel_size
448
+
449
+ # Create the block space manager.
450
+ self.block_manager = BlockSpaceManagerImpl(
451
+ block_size=self.cache_config.block_size,
452
+ num_gpu_blocks=num_gpu_blocks,
453
+ num_cpu_blocks=num_cpu_blocks,
454
+ sliding_window=self.cache_config.sliding_window,
455
+ enable_caching=self.cache_config.enable_prefix_caching,
456
+ )
457
+
458
+ # Sequence groups in the WAITING state.
459
+ # Contain new prefill or preempted requests.
460
+ self.waiting: Deque[SequenceGroup] = deque()
461
+ # Sequence groups in the RUNNING state.
462
+ # Contain decode requests.
463
+ self.running: Deque[SequenceGroup] = deque()
464
+ # Sequence groups in the SWAPPED state.
465
+ # Contain decode requests that are swapped out.
466
+ self.swapped: Deque[SequenceGroup] = deque()
467
+ # Sequence groups finished requests ids since last step iteration.
468
+ # It lets the model know that any state associated with these requests
469
+ # can and must be released after the current step.
470
+ # This is used to evict the finished requests from the Mamba cache.
471
+ self._finished_requests_ids: List[str] = list()
472
+ # Time at previous scheduling step
473
+ self.prev_time = 0.0
474
+ # Did we schedule a prompt at previous step?
475
+ self.prev_prompt = False
476
+ # Latency of the last prompt step
477
+ self.last_prompt_latency = 0.0
478
+ # preemption mode, RECOMPUTE or SWAP
479
+ self.user_specified_preemption_mode = scheduler_config.preemption_mode
480
+
481
+ # The following field is test-only. It is used to inject artificial
482
+ # preemption.
483
+ self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
484
+ self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
485
+ if self.enable_artificial_preemption
486
+ else 0)
487
+ self.num_cumulative_preemption: int = 0
488
+
489
+ # Used to cache python objects
490
+ self._seq_group_metadata_cache: List[PyObjectCache] = []
491
+ self._scheduler_running_outputs_cache: List[PyObjectCache] = []
492
+ self._scheduled_seq_group_cache: List[PyObjectCache] = []
493
+
494
+ # For async output processing, we need to swap cache buffers between
495
+ # iterations. I.e. since the output processing is lagged one step,
496
+ # we cannot reuse the cached objects immediately when schedule()
497
+ # is called again, but only when schedule() is called the second time.
498
+ self.output_proc_callback = output_proc_callback
499
+ self.use_async_output_proc = self.output_proc_callback is not None
500
+ self.num_cache_iters = 2 if self.use_async_output_proc else 1
501
+
502
+ self.cache_id = 0
503
+ for i in range(self.num_cache_iters):
504
+ self._seq_group_metadata_cache.append(
505
+ PyObjectCache(seq_group_metadata_builder))
506
+ self._scheduler_running_outputs_cache.append(
507
+ PyObjectCache(scheduler_running_outputs_builder))
508
+ self._scheduled_seq_group_cache.append(
509
+ PyObjectCache(scheduled_seq_group_builder))
510
+
511
+ # For async postprocessor, the extra decode run cannot be done
512
+ # when the request reaches max_model_len. In this case, the request
513
+ # will be stopped during schedule() call and added to this stop list
514
+ # for processing and deallocation by the free_finished_seq_groups()
515
+ self._async_stopped: List[SequenceGroup] = []
516
+
517
+ # List with the chunk sizes to hand out to each sequence depending
518
+ # on how many partial prefills are running. This is slightly faster than
519
+ # running an integer division every time a prefill is scheduled.
520
+ # This splits the budget evenly among all prefills.
521
+ self.partial_prefill_budget_lookup_list = [0] * (
522
+ self.scheduler_config.max_num_partial_prefills + 1)
523
+ self.partial_prefill_budget_lookup_list[0] = (
524
+ scheduler_config.max_num_batched_tokens)
525
+ for i in range(1, self.scheduler_config.max_num_partial_prefills + 1):
526
+ self.partial_prefill_budget_lookup_list[i] = (
527
+ scheduler_config.max_num_batched_tokens // i)
528
+
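The lookup list built above simply precomputes max_num_batched_tokens // i for every possible number of concurrent partial prefills. With hypothetical config values max_num_batched_tokens=4096 and max_num_partial_prefills=4 it would contain:

    max_num_batched_tokens = 4096   # hypothetical config value
    max_num_partial_prefills = 4    # hypothetical config value
    lookup = [max_num_batched_tokens] + [
        max_num_batched_tokens // i
        for i in range(1, max_num_partial_prefills + 1)
    ]
    print(lookup)  # [4096, 4096, 2048, 1365, 1024]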
529
+ @property
530
+ def next_cache_id(self):
531
+ return (self.cache_id + 1) % self.num_cache_iters
532
+
533
+ @property
534
+ def lora_enabled(self) -> bool:
535
+ return bool(self.lora_config)
536
+
537
+ @property
538
+ def num_decoding_tokens_per_seq(self) -> int:
539
+ """The number of new tokens."""
540
+ return 1
541
+
542
+ def add_seq_group(self, seq_group: SequenceGroup) -> None:
543
+ # Add sequence groups to the waiting queue.
544
+ self.waiting.append(seq_group)
545
+
546
+ def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None:
547
+ # Add sequence groups to the running queue.
548
+ # Only for testing purposes.
549
+ self.running.append(seq_group)
550
+
551
+ def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None:
552
+ # Add sequence groups to the swapped queue.
553
+ # Only for testing purposes.
554
+ self.swapped.append(seq_group)
555
+
556
+ def abort_seq_group(
557
+ self,
558
+ request_id: Union[str, Iterable[str]],
559
+ seq_id_to_seq_group: Optional[Dict[str, SequenceGroupBase]] = None,
560
+ ) -> None:
561
+ """Aborts a sequence group with the given ID.
562
+
563
+ Check if the sequence group with the given ID
564
+ is present in any of the state queues.
565
+ If present, remove the sequence group from the state queue.
566
+ Also, if any of the sequences in the sequence group is not finished,
567
+ free the sequence with status `FINISHED_ABORTED`.
568
+ Otherwise, do nothing.
569
+
570
+ Args:
571
+ request_id: The ID(s) of the sequence group to abort.
572
+ seq_id_to_seq_group: helper for groups with n>1
573
+ """
574
+ if isinstance(request_id, str):
575
+ request_id = (request_id, )
576
+ request_ids = set(request_id)
577
+ seq_id_to_seq_group = seq_id_to_seq_group or {}
578
+ for state_queue in [self.waiting, self.running, self.swapped]:
579
+ aborted_groups: List[SequenceGroup] = []
580
+ for seq_group in state_queue:
581
+ # When n>1, seq_group.request_id looks like
582
+ # foo_parallel_sample_0, while request_ids is just foo, and we
583
+ # should resolve it as real_request_id to match.
584
+ if seq_group.request_id in seq_id_to_seq_group:
585
+ real_request_id = seq_id_to_seq_group[
586
+ seq_group.request_id].group_id
587
+ else:
588
+ real_request_id = seq_group.request_id
589
+ if real_request_id in request_ids:
590
+ # Appending aborted group into pending list.
591
+ aborted_groups.append(seq_group)
592
+ # We can't remove real_request_id in request_ids here,
593
+ # because there may be other seq groups sharing the same
594
+ # real_request_id
595
+ for aborted_group in aborted_groups:
596
+ # Remove the sequence group from the state queue.
597
+ state_queue.remove(aborted_group)
598
+ # Remove the aborted request from the Mamba cache.
599
+ self._finished_requests_ids.append(aborted_group.request_id)
600
+ for seq in aborted_group.get_seqs():
601
+ if seq.is_finished():
602
+ continue
603
+ seq.status = SequenceStatus.FINISHED_ABORTED
604
+ self.free_seq(seq)
605
+ if aborted_group.request_id in seq_id_to_seq_group:
606
+ del seq_id_to_seq_group[aborted_group.request_id]
607
+
608
+ self._free_seq_group_cross_attn_blocks(aborted_group)
609
+
610
+ def _free_seq_group_cross_attn_blocks(
611
+ self,
612
+ seq_group: SequenceGroup,
613
+ ) -> None:
614
+ """
615
+ Free a sequence group from a cross-attention block table.
616
+ Has no effect on decoder-only models.
617
+ """
618
+ if seq_group.is_encoder_decoder():
619
+ self.block_manager.free_cross(seq_group)
620
+
621
+ def has_unfinished_seqs(self) -> bool:
622
+ return (len(self.waiting) != 0 or len(self.running) != 0
623
+ or len(self.swapped) != 0)
624
+
625
+ def get_prefix_cache_hit_rate(self, device: Device) -> float:
626
+ return self.block_manager.get_prefix_cache_hit_rate(device)
627
+
628
+ def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
629
+ return self.block_manager.reset_prefix_cache(device)
630
+
631
+ def get_num_unfinished_seq_groups(self) -> int:
632
+ return len(self.waiting) + len(self.running) + len(self.swapped)
633
+
634
+ def get_and_reset_finished_requests_ids(self) -> List[str]:
635
+ """Flushes the list of request ids of previously finished seq_groups."""
636
+ finished_requests_ids = self._finished_requests_ids
637
+ self._finished_requests_ids = list()
638
+ return finished_requests_ids
639
+
640
+ def _schedule_running(
641
+ self,
642
+ budget: SchedulingBudget,
643
+ curr_loras: Optional[Set[int]],
644
+ enable_chunking: bool = False,
645
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
646
+ ) -> SchedulerRunningOutputs:
647
+ """Schedule sequence groups that are running.
648
+
649
+ Running queue should include decode and chunked prefill requests.
650
+
651
+ Args:
652
+ budget: The scheduling budget. The argument is in-place updated
653
+ when any decodes are preempted.
654
+ curr_loras: Currently batched lora request ids. The argument is
655
+ in-place updated when any decodes are preempted.
656
+ enable_chunking: If True, seq group can be chunked and only a
657
+ chunked number of tokens are scheduled if
658
+ `budget.num_batched_tokens` does not have enough capacity to schedule
659
+ all tokens.
660
+ partial_prefill_metadata: information about the partial prefills
661
+ that are currently running
662
+
663
+ Returns:
664
+ SchedulerRunningOutputs.
665
+ """
666
+ ret: SchedulerRunningOutputs = self._scheduler_running_outputs_cache[
667
+ self.cache_id].get_object()
668
+ ret.blocks_to_swap_out.clear()
669
+ ret.blocks_to_copy.clear()
670
+ ret.decode_seq_groups.clear()
671
+ ret.prefill_seq_groups.clear()
672
+ ret.preempted.clear()
673
+ ret.swapped_out.clear()
674
+
675
+ ret.num_lookahead_slots = self._get_num_lookahead_slots(
676
+ is_prefill=False, enable_chunking=enable_chunking)
677
+
678
+ ret.decode_seq_groups_list.clear()
679
+ ret.prefill_seq_groups_list.clear()
680
+
681
+ # Blocks that need to be swapped or copied before model execution.
682
+ blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
683
+ blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy
684
+
685
+ decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
686
+ prefill_seq_groups: List[
687
+ ScheduledSequenceGroup] = ret.prefill_seq_groups
688
+ preempted: List[SequenceGroup] = ret.preempted
689
+ swapped_out: List[SequenceGroup] = ret.swapped_out
690
+
691
+ running_queue = self.running
692
+ assert len(self._async_stopped) == 0
693
+ while running_queue:
694
+ seq_group = running_queue[0]
695
+ # We discard the cached tokens info here because we don't need it
696
+ # for running sequences:
697
+ # 1. If a sequence is running with chunked prefill, the cached
698
+ # tokens info was already used for the first prefill.
699
+ # 2. If a sequence is running with non-chunked prefill, then
700
+ # it is a decoding sequence, and the cached tokens info is
701
+ # irrelevant.
702
+ num_uncached_new_tokens, _ = \
703
+ self._get_num_new_uncached_and_cached_tokens(
704
+ seq_group,
705
+ SequenceStatus.RUNNING,
706
+ enable_chunking,
707
+ budget,
708
+ partial_prefill_metadata,
709
+ )
710
+
711
+ num_running_tokens = num_uncached_new_tokens
712
+ if num_running_tokens == 0:
713
+ # No budget => Stop
714
+ break
715
+
716
+ running_queue.popleft()
717
+
718
+ # With async postprocessor, an extra decode run is done
719
+ # to process the final tokens. The check below avoids this extra
720
+ # decode run when the model max len is reached, in order to avoid
721
+ # a memory overflow.
722
+ if (self.use_async_output_proc and seq_group.seqs[0].get_len()
723
+ > self.scheduler_config.max_model_len):
724
+ self._async_stopped.append(seq_group)
725
+ continue
726
+
727
+ # NOTE(woosuk): Preemption happens only when there is no available
728
+ # slot to keep all the sequence groups in the RUNNING state.
729
+ while not self._can_append_slots(seq_group, enable_chunking):
730
+ budget.subtract_num_batched_tokens(seq_group.request_id,
731
+ num_running_tokens)
732
+ num_running_seqs = seq_group.get_max_num_running_seqs()
733
+ budget.subtract_num_seqs(seq_group.request_id,
734
+ num_running_seqs)
735
+
736
+ if (curr_loras is not None and seq_group.lora_int_id > 0
737
+ and seq_group.lora_int_id in curr_loras):
738
+ curr_loras.remove(seq_group.lora_int_id)
739
+
740
+ # Determine victim sequence
741
+ cont_loop = True
742
+ if running_queue:
743
+ # Preempt the lowest-priority sequence group.
744
+ victim_seq_group = running_queue.pop()
745
+ else:
746
+ # No other sequence group can be preempted.
747
+ # Preempt the current sequence group.
748
+ # Note: This is also where we stop this loop
749
+ # (since there is nothing else to preempt)
750
+ victim_seq_group = seq_group
751
+ cont_loop = False
752
+
753
+ # With async postprocessor, before preempting a sequence
754
+ # we need to ensure it has no pending async postprocessor
755
+ do_preempt = True
756
+ if self.use_async_output_proc:
757
+ assert self.output_proc_callback is not None
758
+ self.output_proc_callback(
759
+ request_id=victim_seq_group.request_id)
760
+
761
+ # It may be that the async pending "victim_seq_group"
762
+ # becomes finished, in which case we simply free it.
763
+ if victim_seq_group.is_finished():
764
+ self._free_finished_seq_group(victim_seq_group)
765
+ do_preempt = False
766
+
767
+ # Do preemption
768
+ if do_preempt:
769
+ preempted_mode = self._preempt(victim_seq_group,
770
+ blocks_to_swap_out)
771
+ if preempted_mode == PreemptionMode.RECOMPUTE:
772
+ preempted.append(victim_seq_group)
773
+ else:
774
+ swapped_out.append(victim_seq_group)
775
+
776
+ if not cont_loop:
777
+ break
778
+ else:
779
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
780
+ is_prefill = seq_group.is_prefill()
781
+
782
+ scheduled_seq_group: ScheduledSequenceGroup = (
783
+ self._scheduled_seq_group_cache[
784
+ self.cache_id].get_object())
785
+ scheduled_seq_group.seq_group = seq_group
786
+ if is_prefill:
787
+ scheduled_seq_group.token_chunk_size = num_running_tokens
788
+ prefill_seq_groups.append(scheduled_seq_group)
789
+ ret.prefill_seq_groups_list.append(seq_group)
790
+ else:
791
+ scheduled_seq_group.token_chunk_size = 1
792
+ decode_seq_groups.append(scheduled_seq_group)
793
+ ret.decode_seq_groups_list.append(seq_group)
794
+
795
+ budget.add_num_batched_tokens(seq_group.request_id,
796
+ num_running_tokens)
797
+ # OPTIMIZATION: Note that get_max_num_running_seqs is
798
+ # expensive. For the default scheduling case where
799
+ # enable_chunking is False, num_seqs are updated before running
800
+ # this method, so we don't have to update it again here.
801
+ if enable_chunking:
802
+ num_running_seqs = seq_group.get_max_num_running_seqs()
803
+ budget.add_num_seqs(seq_group.request_id, num_running_seqs)
804
+ if curr_loras is not None and seq_group.lora_int_id > 0:
805
+ curr_loras.add(seq_group.lora_int_id)
806
+
807
+ self._scheduler_running_outputs_cache[self.next_cache_id].reset()
808
+ self._scheduled_seq_group_cache[self.next_cache_id].reset()
809
+
810
+ return ret
811
+
812
+ def _schedule_swapped(
813
+ self,
814
+ budget: SchedulingBudget,
815
+ curr_loras: Optional[Set[int]],
816
+ enable_chunking: bool = False,
817
+ ) -> SchedulerSwappedInOutputs:
818
+ """Schedule sequence groups that are swapped out.
819
+
820
+ It schedules swapped requests as long as it fits `budget` and
821
+ curr_loras <= max_lora from the scheduling config. The input arguments
822
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
823
+
824
+ Args:
825
+ budget: The scheduling budget. The argument is in-place updated
826
+ when any requests are swapped in.
827
+ curr_loras: Currently batched lora request ids. The argument is
828
+ in-place updated when any requests are swapped in.
829
+ enable_chunking: If True, seq group can be chunked and only a
830
+ chunked number of tokens are scheduled if
831
+ `budget.num_batched_tokens` does not have enough capacity to schedule
832
+ all tokens.
833
+
834
+ Returns:
835
+ SchedulerSwappedInOutputs.
836
+ """
837
+ # Blocks that need to be swapped or copied before model execution.
838
+ blocks_to_swap_in: List[Tuple[int, int]] = []
839
+ blocks_to_copy: List[Tuple[int, int]] = []
840
+ decode_seq_groups: List[ScheduledSequenceGroup] = []
841
+ prefill_seq_groups: List[ScheduledSequenceGroup] = []
842
+ infeasible_seq_groups: List[SequenceGroup] = []
843
+
844
+ swapped_queue = self.swapped
845
+
846
+ leftover_swapped: Deque[SequenceGroup] = deque()
847
+ while swapped_queue:
848
+ seq_group = swapped_queue[0]
849
+
850
+ # If the sequence group cannot be swapped in, stop.
851
+ is_prefill = seq_group.is_prefill()
852
+ alloc_status = self.block_manager.can_swap_in(
853
+ seq_group,
854
+ self._get_num_lookahead_slots(is_prefill, enable_chunking))
855
+ if alloc_status == AllocStatus.LATER:
856
+ break
857
+ elif alloc_status == AllocStatus.NEVER:
858
+ logger.warning(
859
+ "Failing the request %s because there's not enough kv "
860
+ "cache blocks to run the entire sequence.",
861
+ seq_group.request_id,
862
+ )
863
+ for seq in seq_group.get_seqs():
864
+ seq.status = SequenceStatus.FINISHED_IGNORED
865
+ infeasible_seq_groups.append(seq_group)
866
+ swapped_queue.popleft()
867
+ continue
868
+
869
+ lora_int_id = 0
870
+ if self.lora_enabled:
871
+ lora_int_id = seq_group.lora_int_id
872
+ assert curr_loras is not None
873
+ assert self.lora_config is not None
874
+ if (lora_int_id > 0 and (lora_int_id not in curr_loras)
875
+ and len(curr_loras) >= self.lora_config.max_loras):
876
+ # We don't have space for another LoRA, so
877
+ # we ignore this request for now.
878
+ leftover_swapped.appendleft(seq_group)
879
+ swapped_queue.popleft()
880
+ continue
881
+
882
+ # The total number of sequences in the RUNNING state should not
883
+ # exceed the maximum number of sequences.
884
+ num_new_seqs = seq_group.get_max_num_running_seqs()
885
+ num_new_tokens_uncached, num_new_tokens_cached = (
886
+ self._get_num_new_uncached_and_cached_tokens(
887
+ seq_group, SequenceStatus.SWAPPED, enable_chunking,
888
+ budget))
889
+
890
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
891
+ num_new_tokens=num_new_tokens_uncached,
892
+ num_new_seqs=num_new_seqs,
893
+ ):
894
+ self.remove_seq_from_computed_blocks_tracker(
895
+ seq_group, SequenceStatus.SWAPPED)
896
+ break
897
+
898
+ if lora_int_id > 0 and curr_loras is not None:
899
+ curr_loras.add(lora_int_id)
900
+ swapped_queue.popleft()
901
+ self._swap_in(seq_group, blocks_to_swap_in)
902
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
903
+ if is_prefill:
904
+ prefill_seq_groups.append(
905
+ ScheduledSequenceGroup(
906
+ seq_group,
907
+ token_chunk_size=num_new_tokens_uncached +
908
+ num_new_tokens_cached,
909
+ ))
910
+ else:
911
+ decode_seq_groups.append(
912
+ ScheduledSequenceGroup(seq_group, token_chunk_size=1))
913
+ budget.add_num_batched_tokens(
914
+ seq_group.request_id,
915
+ num_batched_tokens=num_new_tokens_uncached,
916
+ num_cached_tokens=num_new_tokens_cached,
917
+ )
918
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
919
+
920
+ swapped_queue.extendleft(leftover_swapped)
921
+
922
+ return SchedulerSwappedInOutputs(
923
+ decode_seq_groups=decode_seq_groups,
924
+ prefill_seq_groups=prefill_seq_groups,
925
+ blocks_to_swap_in=blocks_to_swap_in,
926
+ blocks_to_copy=blocks_to_copy,
927
+ num_lookahead_slots=self._get_num_lookahead_slots(
928
+ is_prefill=False, enable_chunking=enable_chunking),
929
+ infeasible_seq_groups=infeasible_seq_groups,
930
+ )
931
+
932
+ def _get_prompt_limit(self, seq_group: SequenceGroup) -> int:
933
+ if self.scheduler_config.chunked_prefill_enabled:
934
+ prompt_limit = self.scheduler_config.max_model_len
935
+ else:
936
+ prompt_limit = min(
937
+ self.scheduler_config.max_model_len,
938
+ self.scheduler_config.max_num_batched_tokens,
939
+ )
940
+
941
+ # Model is fine tuned with long context. Return the fine tuned max_len.
942
+ if seq_group.lora_request and seq_group.lora_request.long_lora_max_len:
943
+ assert prompt_limit <= seq_group.lora_request.long_lora_max_len
944
+ return seq_group.lora_request.long_lora_max_len
945
+ else:
946
+ return prompt_limit
947
+
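The effective prompt limit therefore depends on whether chunked prefill is enabled. A small numeric sketch with hypothetical config values:

    max_model_len = 8192             # hypothetical
    max_num_batched_tokens = 2048    # hypothetical
    chunked_prefill_enabled = False  # hypothetical

    prompt_limit = (max_model_len if chunked_prefill_enabled
                    else min(max_model_len, max_num_batched_tokens))
    print(prompt_limit)  # 2048; with chunked prefill enabled it would be 8192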
948
+ def _get_priority(self,
949
+ seq_group: SequenceGroup) -> Tuple[Optional[int], float]:
950
+ """Get the priority of the sequence group.
951
+ Highest preference to user-defined priority, followed by arrival time.
952
+ Args:
953
+ seq_group: The sequence group input.
954
+ Returns:
955
+ The priority of the sequence group.
956
+ """
957
+ return seq_group.priority, seq_group.arrival_time
958
+
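Since Python compares tuples element-wise, sorting by (priority, arrival_time) places smaller priority values first and breaks ties by arrival time, which is the ordering _schedule_priority_preemption relies on below. Illustrative values:

    # (priority, arrival_time) pairs; lower tuples are scheduled first.
    keys = [(1, 10.0), (0, 12.0), (0, 11.0)]
    print(sorted(keys))  # [(0, 11.0), (0, 12.0), (1, 10.0)]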
959
+ def _schedule_priority_preemption(
960
+ self,
961
+ budget: SchedulingBudget,
962
+ ) -> int:
963
+ """Sorts waiting and running queue. Also, force preempt requests
964
+ from the running queue if their priority is lower.
965
+ Priority-based preemption is used with the priority policy.
966
+ Args:
967
+ budget: The scheduling budget. The argument is in-place updated
968
+ when any requests are scheduled.
969
+ Returns:
970
+ A count of priority-based preemptions.
971
+ """
972
+
973
+ waiting_queue = self.waiting
974
+
975
+ running_queue = deque(sorted(self.running, key=self._get_priority))
976
+
977
+ blocks_to_swap_out: List[Tuple[int, int]] = []
978
+ force_preemption_count = 0
979
+
980
+ if waiting_queue:
981
+ seq_group = waiting_queue.popleft()
982
+ num_new_seqs = seq_group.get_max_num_running_seqs()
983
+ num_new_tokens_uncached, _ = \
984
+ self._get_num_new_uncached_and_cached_tokens(
985
+ seq_group, SequenceStatus.WAITING, False, budget)
986
+
987
+ # Only preempt if priority inversion exists
988
+ while running_queue and self._get_priority(
989
+ running_queue[-1]) > self._get_priority(seq_group):
990
+ # Only preempt if waiting sequence cannot be allocated
991
+ can_allocate = self.block_manager.can_allocate(seq_group)
992
+ if (num_new_tokens_uncached > 0
993
+ and can_allocate == AllocStatus.OK
994
+ and budget.can_schedule(
995
+ num_new_tokens=num_new_tokens_uncached,
996
+ num_new_seqs=num_new_seqs,
997
+ )):
998
+ break
999
+
1000
+ # Adjust budget to remove the victim sequence group
1001
+ vseq_group = running_queue.pop()
1002
+ num_running_tokens_uncached, _ = (
1003
+ self._get_num_new_uncached_and_cached_tokens(
1004
+ vseq_group, SequenceStatus.RUNNING, False, budget))
1005
+ budget.subtract_num_batched_tokens(
1006
+ vseq_group.request_id, num_running_tokens_uncached)
1007
+ num_running_seqs = vseq_group.get_max_num_running_seqs()
1008
+ budget.subtract_num_seqs(vseq_group.request_id,
1009
+ num_running_seqs)
1010
+
1011
+ # Preempt out the victim sequence group
1012
+ self._preempt(vseq_group, blocks_to_swap_out)
1013
+ waiting_queue.appendleft(vseq_group)
1014
+ force_preemption_count += 1
1015
+ # Put the sequence back into the waiting queue
1016
+ waiting_queue.appendleft(seq_group)
1017
+
1018
+ self.remove_seq_from_computed_blocks_tracker(
1019
+ seq_group, SequenceStatus.WAITING)
1020
+
1021
+ waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))
1022
+
1023
+ self.waiting = waiting_queue
1024
+ self.running = running_queue
1025
+ return force_preemption_count
1026
+
1027
+ def _schedule_prefills(
1028
+ self,
1029
+ budget: SchedulingBudget,
1030
+ curr_loras: Optional[Set[int]],
1031
+ enable_chunking: bool = False,
1032
+ partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
1033
+ ) -> SchedulerPrefillOutputs:
1034
+ """Schedule sequence groups that are in prefill stage.
1035
+
1036
+ Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
1037
+ as a new prefill (that starts from beginning -> most recently generated
1038
+ tokens).
1039
+
1040
+ It schedules waiting requests as long as it fits `budget` and
1041
+ curr_loras <= max_lora from the scheduling config. The input arguments
1042
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
1043
+
1044
+ Args:
1045
+ budget: The scheduling budget. The argument is in-place updated
1046
+ when any requests are scheduled.
1047
+ curr_loras: Currently batched lora request ids. The argument is
1048
+ in-place updated when any requests are scheduled.
1049
+ enable_chunking: If True, seq group can be chunked and only a
1050
+ chunked number of tokens are scheduled if
1051
+ `budget.num_batched_tokens` does not have enough capacity to schedule
1052
+ all tokens.
1053
+ partial_prefill_metadata: information about the partial prefills
1054
+ that are currently running
1055
+
1056
+ Returns:
1057
+ SchedulerPrefillOutputs.
1058
+ """
1059
+ if budget.remaining_token_budget() == 0:
1060
+ # Do nothing: Can't add any more prefill anyway
1061
+ return SchedulerPrefillOutputs(
1062
+ seq_groups=[],
1063
+ ignored_seq_groups=[],
1064
+ num_lookahead_slots=self._get_num_lookahead_slots(
1065
+ is_prefill=True, enable_chunking=enable_chunking),
1066
+ )
1067
+ ignored_seq_groups: List[SequenceGroup] = []
1068
+ seq_groups: List[ScheduledSequenceGroup] = []
1069
+ using_prompt_embeds: bool = False
1070
+
1071
+ waiting_queue = self.waiting
1072
+
1073
+ leftover_waiting_sequences: Deque[SequenceGroup] = deque()
1074
+ while self._passed_delay(time.time()) and waiting_queue:
1075
+ seq_group = waiting_queue[0]
1076
+
1077
+ waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
1078
+ assert len(waiting_seqs) == 1, (
1079
+ "Waiting sequence group should have only one prompt "
1080
+ "sequence.")
1081
+ if (partial_prefill_metadata is not None
1082
+ and not partial_prefill_metadata.can_schedule(seq_group)):
1083
+ leftover_waiting_sequences.appendleft(seq_group)
1084
+ waiting_queue.popleft()
1085
+ continue
1086
+ num_new_tokens_uncached, num_new_tokens_cached = (
1087
+ self._get_num_new_uncached_and_cached_tokens(
1088
+ seq_group,
1089
+ SequenceStatus.WAITING,
1090
+ enable_chunking,
1091
+ budget,
1092
+ partial_prefill_metadata=partial_prefill_metadata,
1093
+ ))
1094
+ num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached
1095
+
1096
+ if not enable_chunking:
1097
+ num_prompt_tokens = waiting_seqs[0].get_len()
1098
+ assert num_new_tokens == num_prompt_tokens
1099
+
1100
+ prompt_limit = self._get_prompt_limit(seq_group)
1101
+ if num_new_tokens > prompt_limit:
1102
+ logger.warning(
1103
+ "Input prompt (%d tokens) is too long"
1104
+ " and exceeds limit of %d",
1105
+ num_new_tokens,
1106
+ prompt_limit,
1107
+ )
1108
+ for seq in waiting_seqs:
1109
+ seq.status = SequenceStatus.FINISHED_IGNORED
1110
+ self.remove_seq_from_computed_blocks_tracker(
1111
+ seq_group, SequenceStatus.FINISHED_IGNORED)
1112
+ ignored_seq_groups.append(seq_group)
1113
+ waiting_queue.popleft()
1114
+ continue
1115
+
1116
+ num_lookahead_slots: int = 0
1117
+
1118
+ # If the sequence group cannot be allocated, stop.
1119
+ can_allocate = self.block_manager.can_allocate(
1120
+ seq_group, num_lookahead_slots=num_lookahead_slots)
1121
+ if can_allocate == AllocStatus.LATER:
1122
+ self.remove_seq_from_computed_blocks_tracker(
1123
+ seq_group, SequenceStatus.WAITING)
1124
+ break
1125
+ elif can_allocate == AllocStatus.NEVER:
1126
+ logger.warning(
1127
+ "Input prompt (%d tokens) + lookahead slots (%d) is "
1128
+ "too long and exceeds the capacity of block_manager",
1129
+ num_new_tokens,
1130
+ num_lookahead_slots,
1131
+ )
1132
+ for seq in waiting_seqs:
1133
+ seq.status = SequenceStatus.FINISHED_IGNORED
1134
+ self.remove_seq_from_computed_blocks_tracker(
1135
+ seq_group, SequenceStatus.FINISHED_IGNORED)
1136
+ ignored_seq_groups.append(seq_group)
1137
+ waiting_queue.popleft()
1138
+ continue
1139
+
1140
+ # We cannot mix sequence groups that use prompt embeds and
1141
+ # those that do not.
1142
+ if len(seq_groups) == 0:
1143
+ using_prompt_embeds = seq_group.uses_prompt_embeds()
1144
+ if using_prompt_embeds != seq_group.uses_prompt_embeds():
1145
+ self.remove_seq_from_computed_blocks_tracker(
1146
+ seq_group, SequenceStatus.WAITING)
1147
+ leftover_waiting_sequences.appendleft(seq_group)
1148
+ waiting_queue.popleft()
1149
+ continue
1150
+
1151
+ lora_int_id = 0
1152
+ if self.lora_enabled:
1153
+ lora_int_id = seq_group.lora_int_id
1154
+ assert curr_loras is not None
1155
+ assert self.lora_config is not None
1156
+ if (self.lora_enabled and lora_int_id > 0
1157
+ and lora_int_id not in curr_loras
1158
+ and len(curr_loras) >= self.lora_config.max_loras):
1159
+ # We don't have space for another LoRA, so
1160
+ # we ignore this request for now.
1161
+ self.remove_seq_from_computed_blocks_tracker(
1162
+ seq_group, SequenceStatus.WAITING)
1163
+ leftover_waiting_sequences.appendleft(seq_group)
1164
+ waiting_queue.popleft()
1165
+ continue
1166
+
1167
+ if (budget.num_batched_tokens
1168
+ >= self.scheduler_config.max_num_batched_tokens):
1169
+ # We've reached the budget limit - since there might be
1170
+ # continuous prefills in the running queue, we should break
1171
+ # to avoid scheduling any new prefills.
1172
+ self.remove_seq_from_computed_blocks_tracker(
1173
+ seq_group, SequenceStatus.WAITING)
1174
+ break
1175
+
1176
+ num_new_seqs = seq_group.get_max_num_running_seqs()
1177
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
1178
+ num_new_tokens=num_new_tokens_uncached,
1179
+ num_new_seqs=num_new_seqs,
1180
+ ):
1181
+ self.remove_seq_from_computed_blocks_tracker(
1182
+ seq_group, SequenceStatus.WAITING)
1183
+ break
1184
+
1185
+ # Can schedule this request.
1186
+ if curr_loras is not None and lora_int_id > 0:
1187
+ curr_loras.add(lora_int_id)
1188
+ waiting_queue.popleft()
1189
+ self._allocate_and_set_running(seq_group)
1190
+
1191
+ if partial_prefill_metadata is not None:
1192
+ partial_prefill_metadata.maybe_increment_partial_prefills(
1193
+ seq_group)
1194
+
1195
+ seq_groups.append(
1196
+ ScheduledSequenceGroup(seq_group=seq_group,
1197
+ token_chunk_size=num_new_tokens))
1198
+ budget.add_num_batched_tokens(
1199
+ seq_group.request_id,
1200
+ num_batched_tokens=num_new_tokens_uncached,
1201
+ num_cached_tokens=num_new_tokens_cached,
1202
+ )
1203
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
1204
+
1205
+ # Queue requests that couldn't be scheduled.
1206
+ waiting_queue.extendleft(leftover_waiting_sequences)
1207
+ if len(seq_groups) > 0:
1208
+ self.prev_prompt = True
1209
+
1210
+ return SchedulerPrefillOutputs(
1211
+ seq_groups=seq_groups,
1212
+ ignored_seq_groups=ignored_seq_groups,
1213
+ num_lookahead_slots=self._get_num_lookahead_slots(
1214
+ is_prefill=True, enable_chunking=enable_chunking),
1215
+ )
1216
+
1217
+ def _schedule_default(self) -> SchedulerOutputs:
1218
+ """Schedule queued requests.
1219
+
1220
+ The current policy is designed to optimize throughput. First,
1221
+ it batches as many prefill requests as possible. Then it schedules
1222
+ decodes. If there is pressure on GPU memory, decode requests can
1223
+ be swapped or preempted.
1224
+ """
1225
+ # Include running requests to the budget.
1226
+ budget = SchedulingBudget(
1227
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1228
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1229
+ )
1230
+ # Make sure we include num running seqs before scheduling prefill,
1231
+ # so that we don't schedule beyond max_num_seqs for prefill.
1232
+ for seq_group in self.running:
1233
+ budget.add_num_seqs(seq_group.request_id,
1234
+ seq_group.get_max_num_running_seqs())
1235
+ curr_loras = (set(
1236
+ seq_group.lora_int_id for seq_group in self.running
1237
+ if seq_group.lora_int_id > 0) if self.lora_enabled else None)
1238
+
1239
+ prefills = SchedulerPrefillOutputs.create_empty()
1240
+ running_scheduled = SchedulerRunningOutputs.create_empty()
1241
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1242
+
1243
+ # If any requests are swapped, prioritize swapped requests.
1244
+ if not self.swapped:
1245
+ prefills = self._schedule_prefills(budget,
1246
+ curr_loras,
1247
+ enable_chunking=False)
1248
+
1249
+ if len(prefills.seq_groups
1250
+ ) == 0 and self.scheduler_config.policy == "priority":
1251
+ self._schedule_priority_preemption(budget)
1252
+
1253
+ # Don't schedule decodes if prefills are scheduled.
1254
+ # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
1255
+ # only contains decode requests, not chunked prefills.
1256
+ if len(prefills.seq_groups) == 0:
1257
+ running_scheduled = self._schedule_running(budget,
1258
+ curr_loras,
1259
+ enable_chunking=False)
1260
+
1261
+ # If any sequence group is preempted, do not swap in any sequence
1262
+ # group, because it means there's no slot for new running requests.
1263
+ if (len(running_scheduled.preempted) +
1264
+ len(running_scheduled.swapped_out) == 0):
1265
+ swapped_in = \
1266
+ self._schedule_swapped(budget, curr_loras)
1267
+
1268
+ assert (budget.num_batched_tokens
1269
+ <= self.scheduler_config.max_num_batched_tokens)
1270
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1271
+
1272
+ # Update waiting requests.
1273
+ self.waiting.extendleft(running_scheduled.preempted)
1274
+ # Update new running requests.
1275
+ if len(prefills.seq_groups) > 0:
1276
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1277
+
1278
+ self.running.extend(running_scheduled.decode_seq_groups_list)
1279
+
1280
+ if len(swapped_in.decode_seq_groups) > 0:
1281
+ self.running.extend(
1282
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1283
+
1284
+ # Update swapped requests.
1285
+ self.swapped.extend(running_scheduled.swapped_out)
1286
+ preempted = len(running_scheduled.preempted) + len(
1287
+ running_scheduled.swapped_out)
1288
+
1289
+ # There should be no prefill from running queue because this policy
1290
+ # doesn't allow chunked prefills.
1291
+ assert len(running_scheduled.prefill_seq_groups) == 0
1292
+ assert len(swapped_in.prefill_seq_groups) == 0
1293
+
1294
+ # Merge lists
1295
+ num_prefill_groups = len(prefills.seq_groups)
1296
+ ignored_seq_groups_for_embeds = list[SequenceGroup]()
1297
+ if num_prefill_groups > 0:
1298
+ scheduled_seq_groups = prefills.seq_groups
1299
+ scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
1300
+ ignored_seq_groups_for_embeds.clear()
1301
+ else:
1302
+ scheduled_seq_groups = running_scheduled.decode_seq_groups
1303
+ if len(scheduled_seq_groups) > 0:
1304
+ using_prompt_embeds = scheduled_seq_groups[
1305
+ 0].seq_group.uses_prompt_embeds()
1306
+ ignored_seq_groups_for_embeds.clear()
1307
+ indices_ignored = list[int]()
1308
+ for i, schedule_seq_group in enumerate(scheduled_seq_groups):
1309
+ if using_prompt_embeds !=\
1310
+ schedule_seq_group.seq_group.uses_prompt_embeds():
1311
+ ignored_seq_groups_for_embeds.append(
1312
+ schedule_seq_group.seq_group)
1313
+ indices_ignored.append(i)
1314
+ if len(ignored_seq_groups_for_embeds) > 0:
1315
+ scheduled_seq_groups = [
1316
+ group for i, group in enumerate(scheduled_seq_groups)
1317
+ if i not in indices_ignored
1318
+ ]
1319
+ else:
1320
+ ignored_seq_groups_for_embeds.clear()
1321
+
1322
+ scheduled_seq_groups.extend(swapped_in.decode_seq_groups)
1323
+
1324
+ blocks_to_copy = running_scheduled.blocks_to_copy
1325
+ blocks_to_copy.extend(swapped_in.blocks_to_copy)
1326
+
1327
+ ignored_seq_groups = prefills.ignored_seq_groups
1328
+ ignored_seq_groups.extend(ignored_seq_groups_for_embeds)
1329
+ ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)
1330
+
1331
+ return SchedulerOutputs(
1332
+ scheduled_seq_groups=scheduled_seq_groups,
1333
+ num_prefill_groups=num_prefill_groups,
1334
+ num_batched_tokens=budget.num_batched_tokens +
1335
+ budget.num_cached_tokens,
1336
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1337
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1338
+ blocks_to_copy=blocks_to_copy,
1339
+ ignored_seq_groups=ignored_seq_groups,
1340
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
1341
+ running_queue_size=len(self.running),
1342
+ preempted=preempted,
1343
+ )
1344
+
1345
+ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
1346
+ """Schedule queued requests.
1347
+
1348
+ Chunked prefill allows prefill requests to be chunked and batched together
1349
+ with decode requests. This policy 1. schedules as many decode requests
1350
+ as possible, 2. schedules chunked prefill requests that are not
1351
+ finished, 3. schedules swapped requests, and 4. schedules new prefill
1352
+ requests.
1353
+
1354
+ The policy can sustain high GPU utilization because it can put
1355
+ prefill and decode requests into the same batch, while it improves
1356
+ inter-token latency because decode requests don't need to be blocked
1357
+ by prefill requests.
1358
+ """
1359
+ budget = SchedulingBudget(
1360
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1361
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1362
+ )
1363
+ curr_loras: Set[int] = set()
1364
+
1365
+ prefills = SchedulerPrefillOutputs.create_empty()
1366
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1367
+
1368
+ # Create partial prefill metadata
1369
+ partial_prefill_metadata = PartialPrefillMetadata.from_queues(
1370
+ running=self.running,
1371
+ waiting=self.waiting,
1372
+ scheduler_config=self.scheduler_config,
1373
+ )
1374
+
1375
+ # Decoding should be always scheduled first by fcfs.
1376
+ running_scheduled = self._schedule_running(
1377
+ budget,
1378
+ curr_loras,
1379
+ enable_chunking=True,
1380
+ partial_prefill_metadata=partial_prefill_metadata,
1381
+ )
1382
+
1383
+ # Schedule swapped out requests.
1384
+ # If preemption happens, it means we don't have space for swap-in.
1385
+ if len(running_scheduled.preempted) + len(
1386
+ running_scheduled.swapped_out) == 0:
1387
+ swapped_in = self._schedule_swapped(budget, curr_loras)
1388
+
1389
+ prefills = self._schedule_prefills(
1390
+ budget,
1391
+ curr_loras,
1392
+ enable_chunking=True,
1393
+ partial_prefill_metadata=partial_prefill_metadata,
1394
+ )
1395
+
1396
+ assert (budget.num_batched_tokens
1397
+ <= self.scheduler_config.max_num_batched_tokens)
1398
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1399
+
1400
+ # Update waiting requests.
1401
+ self.waiting.extendleft(running_scheduled.preempted)
1402
+
1403
+ # Update new running requests.
1404
+ # By default, vLLM scheduler prioritizes prefills.
1405
+ # Once chunked prefill is enabled,
1406
+ # the policy is changed to prioritize decode requests.
1407
+ self.running.extend(
1408
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1409
+ self.running.extend(
1410
+ [s.seq_group for s in swapped_in.prefill_seq_groups])
1411
+ self.running.extend(
1412
+ [s.seq_group for s in running_scheduled.decode_seq_groups])
1413
+ # Because multiple prefills may be running concurrently, we need to
1414
+ # make sure that prefills which are scheduled to finish are listed
1415
+ # before those that won't. This is so that on the next scheduling
1416
+ # iteration when they have transitioned to the decode stage, they are
1417
+ # properly prioritized over sequences that are still in the prefill
1418
+ # stage.
1419
+ self.running.extend(
1420
+ self._order_finishing_prefills_first(
1421
+ running_scheduled.prefill_seq_groups))
1422
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1423
+
1424
+ # Update swapped requests.
1425
+ self.swapped.extend(running_scheduled.swapped_out)
1426
+ # Put prefills first due to Attention backend ordering assumption.
1427
+ scheduled_seq_groups = (prefills.seq_groups +
1428
+ running_scheduled.prefill_seq_groups +
1429
+ swapped_in.prefill_seq_groups +
1430
+ running_scheduled.decode_seq_groups +
1431
+ swapped_in.decode_seq_groups)
1432
+ num_prefill_groups = (len(prefills.seq_groups) +
1433
+ len(swapped_in.prefill_seq_groups) +
1434
+ len(running_scheduled.prefill_seq_groups))
1435
+ return SchedulerOutputs(
1436
+ scheduled_seq_groups=scheduled_seq_groups,
1437
+ num_prefill_groups=num_prefill_groups,
1438
+ num_batched_tokens=budget.num_batched_tokens +
1439
+ budget.num_cached_tokens,
1440
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1441
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1442
+ blocks_to_copy=running_scheduled.blocks_to_copy +
1443
+ swapped_in.blocks_to_copy,
1444
+ ignored_seq_groups=prefills.ignored_seq_groups +
1445
+ swapped_in.infeasible_seq_groups,
1446
+ num_lookahead_slots=0,
1447
+ running_queue_size=len(self.running),
1448
+ preempted=(len(running_scheduled.preempted) +
1449
+ len(running_scheduled.swapped_out)),
1450
+ )
1451
+
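# Illustrative sketch (not part of this diff; names are hypothetical). It only
# shows how a token/sequence budget of the kind created above gates what the
# _schedule_running / _schedule_prefills calls may admit into one step.
class ToyBudget:
    def __init__(self, token_budget: int, max_num_seqs: int) -> None:
        self.token_budget = token_budget
        self.max_num_seqs = max_num_seqs
        self.num_batched_tokens = 0
        self.num_curr_seqs = 0

    def can_schedule(self, num_new_tokens: int, num_new_seqs: int) -> bool:
        # A request fits only if both the token and the sequence caps hold.
        return (self.num_batched_tokens + num_new_tokens <= self.token_budget
                and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)

toy = ToyBudget(token_budget=2048, max_num_seqs=256)
assert toy.can_schedule(num_new_tokens=512, num_new_seqs=1)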
+    def _order_finishing_prefills_first(
+            self, scheduled_prefill_seqs: List[ScheduledSequenceGroup]
+    ) -> List[SequenceGroup]:
+        """Returns a list of prefilling SequenceGroups where sequences that are
+        scheduled to finish prefilling are listed first."""
+        finishing = [
+            s.seq_group for s in scheduled_prefill_seqs
+            if s.seq_group.get_num_uncomputed_tokens() == s.token_chunk_size
+        ]
+        not_finishing = [
+            s.seq_group for s in scheduled_prefill_seqs
+            if s.seq_group.get_num_uncomputed_tokens() != s.token_chunk_size
+        ]
+        return finishing + not_finishing
+
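# Illustrative example (hypothetical data, not the vLLM types): a prefill whose
# remaining uncomputed tokens exactly equal its scheduled chunk finishes this
# step, so it is listed first and gets decode priority on the next iteration.
scheduled = [("req-A", 256, 128),   # (request, uncomputed tokens, chunk size)
             ("req-B", 128, 128)]   # req-B finishes its prefill this step
finishing = [r for r, uncomputed, chunk in scheduled if uncomputed == chunk]
not_finishing = [r for r, uncomputed, chunk in scheduled if uncomputed != chunk]
assert finishing + not_finishing == ["req-B", "req-A"]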
+    def _schedule(self) -> SchedulerOutputs:
+        """Schedule queued requests."""
+        if self.scheduler_config.chunked_prefill_enabled:
+            return self._schedule_chunked_prefill()
+        else:
+            return self._schedule_default()
+
+    def _can_append_slots(self, seq_group: SequenceGroup,
+                          enable_chunking: bool) -> bool:
+        """Determine whether or not we have enough space in the KV cache to
+        continue generation of the sequence group.
+        """
+        # This is True only in testing, to trigger artificial preemption.
+        if (self.enable_artificial_preemption
+                and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
+                and self.artificial_preempt_cnt > 0):
+            self.artificial_preempt_cnt -= 1
+            return False
+
+        is_prefill = seq_group.is_prefill()
+        num_lookahead_slots = self._get_num_lookahead_slots(
+            is_prefill, enable_chunking)
+
+        return self.block_manager.can_append_slots(
+            seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)
+
+    def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
+        # Async output processing is allowed only when the sequence group
+        # contains a single sequence (n == 1 or no sampling params).
+        no_single_seq = seq_group.sampling_params is None or (
+            seq_group.sampling_params.n == 1)
+        return no_single_seq
+
+    def schedule(
+        self
+    ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
+        # Schedule sequence groups.
+        # This function call changes the internal states of the scheduler
+        # such as self.running, self.swapped, and self.waiting.
+        scheduler_start_time = time.perf_counter()
+
+        scheduler_outputs: SchedulerOutputs = self._schedule()
+        now = time.time()
+
+        if not self.cache_config.enable_prefix_caching:
+            common_computed_block_nums = []
+
+        allow_async_output_proc: bool = self.use_async_output_proc
+
+        # Create input data structures.
+        seq_group_metadata_list: List[SequenceGroupMetadata] = []
+        for i, scheduled_seq_group in enumerate(
+                scheduler_outputs.scheduled_seq_groups):
+            seq_group = scheduled_seq_group.seq_group
+            token_chunk_size = scheduled_seq_group.token_chunk_size
+            seq_group.maybe_set_first_scheduled_time(now)
+
+            seq_group_metadata = self._seq_group_metadata_cache[
+                self.cache_id].get_object()
+            seq_group_metadata.seq_data.clear()
+            seq_group_metadata.block_tables.clear()
+
+            # seq_id -> SequenceData
+            seq_data: Dict[int, SequenceData] = {}
+            # seq_id -> physical block numbers
+            block_tables: Dict[int, List[int]] = {}
+
+            if seq_group.is_encoder_decoder():
+                # Encoder associated with SequenceGroup
+                encoder_seq = seq_group.get_encoder_seq()
+                assert encoder_seq is not None
+                encoder_seq_data = encoder_seq.data
+                # Block table for cross-attention
+                # Also managed at SequenceGroup level
+                cross_block_table = self.block_manager.get_cross_block_table(
+                    seq_group)
+            else:
+                encoder_seq_data = None
+                cross_block_table = None
+
+            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+                seq_id = seq.seq_id
+                seq_data[seq_id] = seq.data
+                block_tables[seq_id] = self.block_manager.get_block_table(seq)
+                self.block_manager.access_all_blocks_in_seq(seq, now)
+
+            if self.cache_config.enable_prefix_caching:
+                common_computed_block_nums = (
+                    self.block_manager.get_common_computed_block_ids(
+                        seq_group.get_seqs(status=SequenceStatus.RUNNING)))
+
+            do_sample = True
+            is_prompt = seq_group.is_prefill()
+            # We should send the metadata to workers when the first prefill
+            # is sent. Subsequent requests could be chunked prefill or decode.
+            is_first_prefill = False
+            if is_prompt:
+                seqs = seq_group.get_seqs()
+                # Prefill has only 1 sequence.
+                assert len(seqs) == 1
+                num_computed_tokens = seqs[0].data.get_num_computed_tokens()
+                is_first_prefill = num_computed_tokens == 0
+                # If not all prompt tokens will have been computed after this
+                # step, the prefill is chunked and we don't need sampling.
+                # NOTE: We use get_len instead of get_prompt_len because when
+                # a sequence is preempted, prefill includes previous generated
+                # output tokens.
+                if (token_chunk_size + num_computed_tokens
+                        < seqs[0].data.get_len()):
+                    do_sample = False
+
+            # This assumes scheduled_seq_groups is ordered with prefills
+            # before decodes.
+            if is_first_prefill or not self.scheduler_config.send_delta_data:
+                seq_group_metadata = SequenceGroupMetadata(
+                    request_id=seq_group.request_id,
+                    is_prompt=is_prompt,
+                    seq_data=seq_data,
+                    sampling_params=seq_group.sampling_params,
+                    block_tables=block_tables,
+                    do_sample=do_sample,
+                    pooling_params=seq_group.pooling_params,
+                    token_chunk_size=token_chunk_size,
+                    lora_request=seq_group.lora_request,
+                    computed_block_nums=common_computed_block_nums,
+                    encoder_seq_data=encoder_seq_data,
+                    cross_block_table=cross_block_table,
+                    state=seq_group.state,
+                    # `multi_modal_data` will only be present for the 1st comm
+                    # between engine and worker.
+                    # The subsequent comms can still use delta, but
+                    # `multi_modal_data` will be None.
+                    multi_modal_data=(seq_group.multi_modal_data
+                                      if scheduler_outputs.num_prefill_groups
+                                      > 0 else None),
+                    multi_modal_placeholders=(
+                        seq_group.multi_modal_placeholders
+                        if scheduler_outputs.num_prefill_groups > 0 else None),
+                )
+            else:
+                # When SPMD mode is enabled, we only send delta data except for
+                # the first request to reduce serialization cost.
+                seq_data_delta = {}
+                for id, data in seq_data.items():
+                    seq_data_delta[id] = data.get_delta_and_reset()
+                seq_group_metadata = SequenceGroupMetadataDelta(
+                    seq_data_delta,
+                    seq_group.request_id,
+                    block_tables,
+                    is_prompt,
+                    do_sample=do_sample,
+                    token_chunk_size=token_chunk_size,
+                    computed_block_nums=common_computed_block_nums,
+                )
+            seq_group_metadata_list.append(seq_group_metadata)
+
+            if allow_async_output_proc:
+                allow_async_output_proc = self._allow_async_output_proc(
+                    seq_group)
+
+        # Now that the batch has been created, we can assume all blocks in the
+        # batch will have been computed before the next scheduling invocation.
+        # This is because the engine assumes that a failure in model execution
+        # will crash the vLLM instance / will not retry.
+        for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
+            self.block_manager.mark_blocks_as_computed(
+                scheduled_seq_group.seq_group,
+                scheduled_seq_group.token_chunk_size)
+
+        self._seq_group_metadata_cache[self.next_cache_id].reset()
+
+        scheduler_time = time.perf_counter() - scheduler_start_time
+        # Add this scheduler time to all the sequences that are currently
+        # running. This helps estimate whether the scheduler is a significant
+        # component of the end-to-end latency.
+        for seq_group in self.running:
+            if seq_group is not None and seq_group.metrics is not None:
+                if seq_group.metrics.scheduler_time is not None:
+                    seq_group.metrics.scheduler_time += scheduler_time
+                else:
+                    seq_group.metrics.scheduler_time = scheduler_time
+
+        # Move to the next cache (if it exists)
+        self.cache_id = self.next_cache_id
+
+        # Return results
+        return (seq_group_metadata_list, scheduler_outputs,
+                allow_async_output_proc)
+
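# Illustrative example (hypothetical numbers, not part of this diff): how the
# chunked-prefill check above decides whether to sample on this step.
prompt_len = 1000          # seqs[0].data.get_len()
num_computed_tokens = 512  # tokens already prefilled in earlier steps
token_chunk_size = 256     # tokens scheduled for this step
do_sample = not (token_chunk_size + num_computed_tokens < prompt_len)
assert do_sample is False  # 768 < 1000: still mid-prefill, so no sampling yet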
+    def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
+        self.block_manager.fork(parent_seq, child_seq)
+
+    def free_seq(self, seq: Sequence) -> None:
+        """Free a sequence from a block table."""
+        self.block_manager.free(seq)
+
+    def remove_seq_from_computed_blocks_tracker(
+            self, seq_group: SequenceGroup,
+            status: Optional[SequenceStatus]) -> None:
+        seqs = seq_group.get_seqs(status=status)
+        for seq in seqs:
+            self._remove_seq_from_computed_blocks_tracker(seq)
+
+    def _remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
+        """
+        Remove a sequence from the computed blocks tracker
+        (_seq_id_to_blocks_hashes and _seq_id_to_num_tokens_computed).
+        """
+        self.block_manager.remove_seq_from_computed_blocks_tracker(seq)
+
+    def _free_finished_seqs(self, seq_group: SequenceGroup) -> None:
+        """Free finished seqs in a sequence group."""
+        for seq in seq_group.get_seqs():
+            if seq.is_finished():
+                self.free_seq(seq)
+
+    def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None:
+        if seq_group.is_finished():
+            # Free cross-attention block table, if it exists
+            self._free_seq_group_cross_attn_blocks(seq_group)
+
+            # Add the finished requests to the finished requests list.
+            # This list will be used to update the Mamba cache in the
+            # next step.
+            self._finished_requests_ids.append(seq_group.request_id)
+
+        # Free finished seqs
+        self._free_finished_seqs(seq_group)
+
+    def free_finished_seq_groups(self) -> None:
+        remaining: Deque[SequenceGroup] = deque()
+        for seq_group in self.running:
+            self._free_finished_seq_group(seq_group)
+            if not seq_group.is_finished():
+                remaining.append(seq_group)
+
+        self.running = remaining
+
+        # Handle async stopped sequence groups
+        # (ones that reached max model len)
+        if self._async_stopped:
+            for seq_group in self._async_stopped:
+                self._free_seq_group_cross_attn_blocks(seq_group)
+                self._finished_requests_ids.append(seq_group.request_id)
+
+                # Free finished seqs
+                self._free_finished_seqs(seq_group)
+
+            self._async_stopped.clear()
+
+    def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
+        self.block_manager.allocate(seq_group)
+        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
+            seq.status = SequenceStatus.RUNNING
+
+    def _append_slots(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_copy: List[Tuple[int, int]],
+        enable_chunking: bool = False,
+    ) -> None:
+        """Appends new slots to the sequences in the given sequence group.
+
+        Args:
+            seq_group (SequenceGroup): The sequence group containing the
+                sequences to append slots to.
+            blocks_to_copy (List[Tuple[int, int]]): A list of tuples of two
+                ints, where the first int is the source block index and the
+                second int is the destination block index. This list is updated
+                with the new source and destination block indices for the
+                appended slots.
+            enable_chunking (bool): True if chunked prefill is enabled.
+        """
+        is_prefill: bool = seq_group.is_prefill()
+        num_lookahead_slots: int = self._get_num_lookahead_slots(
+            is_prefill, enable_chunking)
+
+        seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING
+        for seq in seq_group.get_seqs(status=seq_status):
+            cows = self.block_manager.append_slots(seq, num_lookahead_slots)
+            if len(cows) > 0:
+                blocks_to_copy.extend(cows)
+
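# Illustrative example (hypothetical block numbers, not part of this diff):
# append_slots returns copy-on-write mappings, which accumulate into
# blocks_to_copy as (source_block, destination_block) pairs for the worker.
blocks_to_copy = []            # List[Tuple[int, int]]
cows = [(7, 12)]               # e.g. shared block 7 must be copied into block 12
if len(cows) > 0:
    blocks_to_copy.extend(cows)
assert blocks_to_copy == [(7, 12)]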
+    def _preempt(self, seq_group: SequenceGroup,
+                 blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
+        # If preemption mode is not specified, we determine the mode as follows:
+        # We use recomputation by default since it incurs lower overhead than
+        # swapping. However, when the sequence group has multiple sequences
+        # (e.g., beam search), recomputation is not currently supported. In
+        # such a case, we use swapping instead.
+        # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
+        # As swapped sequences are prioritized over waiting sequences,
+        # sequence groups with multiple sequences are implicitly prioritized
+        # over sequence groups with a single sequence.
+        # TODO(woosuk): Support recomputation for sequence groups with multiple
+        # sequences. This may require a more sophisticated CUDA kernel.
+        if self.user_specified_preemption_mode is None:
+            if seq_group.get_max_num_running_seqs() == 1:
+                preemption_mode = PreemptionMode.RECOMPUTE
+            else:
+                preemption_mode = PreemptionMode.SWAP
+
+        elif self.user_specified_preemption_mode == "swap":
+            preemption_mode = PreemptionMode.SWAP
+        else:
+            preemption_mode = PreemptionMode.RECOMPUTE
+
+        if self.num_cumulative_preemption % 50 == 0:
+            logger.warning(
+                "Sequence group %s is preempted by %s mode because there is "
+                "not enough KV cache space. This can affect the end-to-end "
+                "performance. Increase gpu_memory_utilization or "
+                "tensor_parallel_size to provide more KV cache memory. "
+                "total_num_cumulative_preemption=%d",
+                seq_group.request_id,
+                preemption_mode,
+                self.num_cumulative_preemption + 1,
+            )
+        self.num_cumulative_preemption += 1
+
+        if preemption_mode == PreemptionMode.RECOMPUTE:
+            self._preempt_by_recompute(seq_group)
+        elif preemption_mode == PreemptionMode.SWAP:
+            self._preempt_by_swap(seq_group, blocks_to_swap_out)
+        else:
+            raise AssertionError("Invalid preemption mode.")
+        return preemption_mode
+
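# Illustrative sketch (hypothetical helper, not the vLLM API): the default
# preemption choice made above when the user does not force a mode.
def choose_preemption_mode(max_num_running_seqs: int, forced_mode: str | None) -> str:
    if forced_mode == "swap":
        return "swap"
    if forced_mode is not None:
        return "recompute"
    # Single-sequence groups are cheaper to recompute from their prompt;
    # multi-sequence groups (e.g. beam search) must be swapped out instead.
    return "recompute" if max_num_running_seqs == 1 else "swap"

assert choose_preemption_mode(1, None) == "recompute"
assert choose_preemption_mode(4, None) == "swap"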
+    def _preempt_by_recompute(
+        self,
+        seq_group: SequenceGroup,
+    ) -> None:
+        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
+        assert len(seqs) == 1
+        for seq in seqs:
+            seq.status = SequenceStatus.WAITING
+            self.free_seq(seq)
+            seq.reset_state_for_recompute()
+        self._free_seq_group_cross_attn_blocks(seq_group)
+
+    def _preempt_by_swap(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_out: List[Tuple[int, int]],
+    ) -> None:
+        self._swap_out(seq_group, blocks_to_swap_out)
+
+    def _swap_in(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_in: List[Tuple[int, int]],
+    ) -> None:
+        mapping = self.block_manager.swap_in(seq_group)
+        blocks_to_swap_in.extend(mapping)
+        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
+            seq.status = SequenceStatus.RUNNING
+
+    def _swap_out(
+        self,
+        seq_group: SequenceGroup,
+        blocks_to_swap_out: List[Tuple[int, int]],
+    ) -> None:
+        if not self.block_manager.can_swap_out(seq_group):
+            # FIXME(woosuk): Abort the sequence group instead of aborting the
+            # entire engine.
+            raise RuntimeError(
+                "Aborted due to the lack of CPU swap space. Please increase "
+                "the swap space to avoid this error.")
+        mapping = self.block_manager.swap_out(seq_group)
+        blocks_to_swap_out.extend(mapping)
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            seq.status = SequenceStatus.SWAPPED
+
+    def _passed_delay(self, now: float) -> bool:
+        if self.prev_prompt:
+            self.last_prompt_latency = now - self.prev_time
+        self.prev_time, self.prev_prompt = now, False
+        # Delay scheduling prompts to let the waiting queue fill up
+        if self.scheduler_config.delay_factor > 0 and self.waiting:
+            earliest_arrival_time = min(
+                [e.metrics.arrival_time for e in self.waiting])
+            passed_delay = ((now - earliest_arrival_time)
+                            > (self.scheduler_config.delay_factor *
+                               self.last_prompt_latency) or not self.running)
+        else:
+            passed_delay = True
+        return passed_delay
+
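# Illustrative example (hypothetical numbers): with delay_factor = 0.5 and a
# previous prompt latency of 2.0 s, new prompts are scheduled only once the
# oldest waiting request has been queued for more than 1.0 s (or nothing is
# running).
delay_factor = 0.5
last_prompt_latency = 2.0    # seconds spent on the previous prompt batch
waited = 1.3                 # seconds the oldest waiting request has been queued
have_running = True
passed_delay = (waited > delay_factor * last_prompt_latency) or not have_running
assert passed_delay is True  # 1.3 > 1.0, so prompt scheduling may proceed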
+    def _get_num_lookahead_slots(self, is_prefill: bool,
+                                 enable_chunking: bool) -> int:
+        """The number of slots to allocate per sequence per step, beyond known
+        token ids. Speculative decoding uses these slots to store KV activations
+        of tokens which may or may not be accepted.
+        """
+        return 0
+
+    def _get_num_new_uncached_and_cached_tokens(
+        self,
+        seq_group: SequenceGroup,
+        status: SequenceStatus,
+        enable_chunking: bool,
+        budget: SchedulingBudget,
+        partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
+    ) -> Tuple[int, int]:
+        """
+        Returns the number of new uncached and cached tokens to schedule for a
+        given sequence group that's in a given `status`.
+
+        The API could chunk the number of tokens to compute based on `budget`
+        if `enable_chunking` is True. If a sequence group has multiple
+        sequences (e.g., running beam search), it means it is in the decoding
+        phase, so chunking doesn't happen.
+
+        Returns (0, 0) if the new tokens cannot be computed due to the token
+        budget.
+
+        The cached tokens' blocks are already computed, and the attention
+        backend will reuse the cached blocks rather than recomputing them. So
+        the scheduler could schedule these cached tokens "for free".
+
+        Args:
+            seq_group: The sequence group to get the number of new tokens to
+                schedule.
+            status: The status of the sequences to get the number of new tokens
+                to schedule.
+            enable_chunking: Whether to chunk the number of tokens to compute.
+            budget: The budget to chunk the number of tokens to compute.
+            partial_prefill_metadata: Information about the partial prefills
+                that are currently running.
+
+        Returns:
+            A tuple of two ints. The first int is the number of new uncached
+            tokens to schedule. The second int is the number of cached tokens.
+            If no more new tokens can be scheduled, returns (0, 0).
+        """
+        num_cached_new_tokens = 0
+        num_uncached_new_tokens = 0
+
+        seqs = seq_group.get_seqs(status=status)
+        # Compute the number of new uncached and cached tokens for
+        # each sequence.
+        for seq in seqs:
+            if not seq.is_prefill():
+                # Decode sequences should always just have 1 uncached token
+                # TODO(rickyx): Actually is this still correct for multi-step?
+                num_uncached_new_tokens += 1
+                continue
+
+            num_computed_tokens_seq = seq.get_num_computed_tokens()
+            all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq
+            if not self.cache_config.enable_prefix_caching:
+                # If prefix caching is not enabled, all new tokens are uncached.
+                num_uncached_new_tokens += all_num_new_tokens_seq
+                continue
+
+            # NOTE: the cached token might currently be in a block that's in an
+            # evictor, meaning that it's not yet allocated. However, we don't
+            # exclude such tokens in the cache count because it will be
+            # guaranteed to be allocated later if the sequence can be allocated.
+            num_cached_tokens_seq = self.block_manager.get_num_cached_tokens(
+                seq)
+
+            # Sanity check.
+            if num_cached_tokens_seq < num_computed_tokens_seq:
+                # This should only happen with chunked prefill, and
+                # the seq is still in prefill. The `num_cached_tokens_seq`
+                # is the value we calculated on scheduling the first prefill.
+                # For subsequent continuous prefill steps, we cached the
+                # number of cache tokens for the sequence so the cached token
+                # count could be less than the number of computed tokens.
+                # See comments on `ComputedBlocksTracker` for more details.
+                assert (
+                    seq.is_prefill() and seq.status == SequenceStatus.RUNNING
+                    and self.scheduler_config.chunked_prefill_enabled
+                ), ("Number of cached tokens should not be less than the "
+                    "number of computed tokens for a sequence that's still "
+                    f"in prefill. But there are {num_cached_tokens_seq} cached "
+                    f"tokens and {num_computed_tokens_seq} computed tokens "
+                    f"for sequence {seq.seq_id}.")
+
+            num_cached_new_tokens_seq = max(
+                0, num_cached_tokens_seq - num_computed_tokens_seq)
+            num_uncached_new_tokens_seq = (all_num_new_tokens_seq -
+                                           num_cached_new_tokens_seq)
+
+            num_uncached_new_tokens += num_uncached_new_tokens_seq
+            num_cached_new_tokens += num_cached_new_tokens_seq
+
+        if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0:
+            # For a fully cached hit sequence, we actually need to recompute the
+            # last token. So we need at least 1 uncached token to schedule.
+            # See ModelRunner._compute_for_prefix_cache_hit for more details.
+            num_uncached_new_tokens = 1
+            num_cached_new_tokens -= 1
+
+        if enable_chunking and len(seqs) == 1:
+            # Chunk if a running request cannot fit in the given budget.
+            # If the number of seqs > 1, it means it is doing beam search
+            # in a decode phase. Do not chunk.
+            num_uncached_new_tokens = self._chunk_new_tokens_to_schedule(
+                self.scheduler_config,
+                self.cache_config,
+                budget,
+                self._get_prompt_limit(seq_group),
+                num_uncached_new_tokens,
+                self.partial_prefill_budget_lookup_list,
+                partial_prefill_metadata,
+            )
+
+        return num_uncached_new_tokens, num_cached_new_tokens
+
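# Illustrative example (hypothetical numbers, not part of this diff): splitting
# a prefill into cached and uncached new tokens when prefix caching is enabled.
seq_len = 1024                # seq.get_len()
num_computed_tokens = 0       # nothing computed yet for this sequence
num_cached_tokens = 512       # prefix blocks already present in the KV cache
all_new = seq_len - num_computed_tokens                       # 1024
cached_new = max(0, num_cached_tokens - num_computed_tokens)  # 512 "for free"
uncached_new = all_new - cached_new                           # 512 to compute
assert (uncached_new, cached_new) == (512, 512)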
+    @staticmethod
+    def _chunk_new_tokens_to_schedule(
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+        budget: SchedulingBudget,
+        prompt_limit: int,
+        num_new_tokens: int,
+        partial_prefill_budget_lookup_list: List[int],
+        partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
+    ) -> int:
+        """
+        Chunks the number of new tokens to schedule based on the budget when
+        chunked prefill is enabled.
+
+        Args:
+            scheduler_config: The scheduler config.
+            cache_config: The cache config.
+            budget: The budget to chunk the number of tokens to compute.
+            prompt_limit: The maximum number of tokens allowed in a prompt.
+            num_new_tokens: The number of new tokens to schedule.
+            partial_prefill_budget_lookup_list: Lookup list mapping the number
+                of schedulable partial prefills to the per-prefill token
+                budget.
+            partial_prefill_metadata: Information about the partial prefills
+                that are currently running.
+
+        Returns:
+            The number of new tokens to schedule after chunking.
+        """
+        remaining_token_budget = budget.remaining_token_budget()
+
+        # Get the number of tokens to allocate to this prefill slot
+        prefill_slot_budget = (
+            remaining_token_budget if partial_prefill_metadata is None else
+            partial_prefill_budget_lookup_list[
+                partial_prefill_metadata.schedulable_prefills])
+
+        if cache_config.enable_prefix_caching:
+            # When prefix caching is enabled and we're partially prefilling
+            # a sequence, we always allocate a number of new tokens that is
+            # divisible by the block size to avoid partial block matching.
+            block_size = cache_config.block_size
+            # Don't exceed either the total budget or the slot budget.
+            # Take the min of those and round down to the next lowest multiple
+            # of the block size:
+            remaining_token_budget = (
+                min(remaining_token_budget, prefill_slot_budget) //
+                block_size) * block_size
+            # NB: In the case where num_new_tokens < budget, we are
+            # finishing prefill for this sequence, so we do not need to
+            # allocate a full block.
+
+        num_new_tokens = min(num_new_tokens, remaining_token_budget,
+                             prefill_slot_budget)
+
+        return num_new_tokens
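# Illustrative example (hypothetical numbers, not part of this diff): with
# prefix caching enabled, the chunk scheduled for a partial prefill is rounded
# down to a multiple of the KV-cache block size.
block_size = 16
remaining_token_budget = 100
prefill_slot_budget = 100
num_new_tokens = 1000          # tokens this sequence still wants to prefill
rounded = (min(remaining_token_budget, prefill_slot_budget) //
           block_size) * block_size
num_new_tokens = min(num_new_tokens, rounded, prefill_slot_budget)
assert num_new_tokens == 96    # 100 rounded down to the nearest multiple of 16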