vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl
This diff shows the contents of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Potentially problematic release.
This version of vllm-cpu might be problematic.
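The listing below shows one entry per file in the wheel, with the number of added lines for each (binary files such as `.so` and `.jpg` show `+0 -0`). As a minimal sketch, the Python snippet here reproduces a comparable manifest from a locally downloaded copy of the wheel; the exact filename is an assumption based on the header above, and this sketch prints uncompressed byte sizes rather than the added-line counts shown in the diff.

```python
# Minimal sketch: enumerate the files inside a locally downloaded wheel,
# similar to the manifest below. A wheel is a zip archive, so the standard
# zipfile module is enough. The filename is an assumption; adjust the path
# to wherever the wheel actually lives.
import zipfile

WHEEL = "vllm_cpu-0.8.5.post2-cp310-cp310-manylinux_2_17_x86_64.whl"  # assumed name

with zipfile.ZipFile(WHEEL) as wheel:
    for info in wheel.infolist():
        # Each entry corresponds to one line of the listing, e.g. "vllm/_custom_ops.py".
        # file_size is the uncompressed size in bytes (the diff below reports
        # added line counts instead).
        print(f"{info.filename}  {info.file_size} bytes")
```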
- vllm/_C.abi3.so +0 -0
- vllm/__init__.py +170 -0
- vllm/_custom_ops.py +1536 -0
- vllm/_ipex_ops.py +241 -0
- vllm/_version.py +34 -0
- vllm/adapter_commons/__init__.py +0 -0
- vllm/adapter_commons/layers.py +16 -0
- vllm/adapter_commons/models.py +105 -0
- vllm/adapter_commons/request.py +25 -0
- vllm/adapter_commons/utils.py +92 -0
- vllm/adapter_commons/worker_manager.py +38 -0
- vllm/assets/__init__.py +0 -0
- vllm/assets/audio.py +38 -0
- vllm/assets/base.py +40 -0
- vllm/assets/image.py +31 -0
- vllm/assets/video.py +103 -0
- vllm/attention/__init__.py +19 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +306 -0
- vllm/attention/backends/blocksparse_attn.py +457 -0
- vllm/attention/backends/cpu_mla.py +303 -0
- vllm/attention/backends/flash_attn.py +999 -0
- vllm/attention/backends/flashinfer.py +1092 -0
- vllm/attention/backends/flashmla.py +242 -0
- vllm/attention/backends/hpu_attn.py +301 -0
- vllm/attention/backends/ipex_attn.py +396 -0
- vllm/attention/backends/mla/__init__.py +0 -0
- vllm/attention/backends/mla/common.py +1444 -0
- vllm/attention/backends/pallas.py +346 -0
- vllm/attention/backends/placeholder_attn.py +399 -0
- vllm/attention/backends/rocm_aiter_mla.py +412 -0
- vllm/attention/backends/rocm_flash_attn.py +969 -0
- vllm/attention/backends/torch_sdpa.py +691 -0
- vllm/attention/backends/triton_mla.py +113 -0
- vllm/attention/backends/utils.py +609 -0
- vllm/attention/backends/xformers.py +798 -0
- vllm/attention/layer.py +443 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
- vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
- vllm/attention/ops/blocksparse_attention/interface.py +238 -0
- vllm/attention/ops/blocksparse_attention/utils.py +244 -0
- vllm/attention/ops/chunked_prefill_paged_decode.py +366 -0
- vllm/attention/ops/flashmla.py +115 -0
- vllm/attention/ops/hpu_paged_attn.py +105 -0
- vllm/attention/ops/ipex_attn.py +193 -0
- vllm/attention/ops/merge_attn_states.py +42 -0
- vllm/attention/ops/nki_flash_attn.py +905 -0
- vllm/attention/ops/paged_attn.py +255 -0
- vllm/attention/ops/prefix_prefill.py +902 -0
- vllm/attention/ops/rocm_aiter_mla.py +42 -0
- vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
- vllm/attention/ops/triton_decode_attention.py +675 -0
- vllm/attention/ops/triton_flash_attention.py +1375 -0
- vllm/attention/ops/triton_merge_attn_states.py +96 -0
- vllm/attention/selector.py +186 -0
- vllm/attention/utils/fa_utils.py +54 -0
- vllm/beam_search.py +82 -0
- vllm/benchmarks/__init__.py +0 -0
- vllm/benchmarks/datasets.py +831 -0
- vllm/benchmarks/endpoint_request_func.py +160 -0
- vllm/benchmarks/latency.py +181 -0
- vllm/benchmarks/serve.py +925 -0
- vllm/benchmarks/throughput.py +608 -0
- vllm/benchmarks/utils.py +69 -0
- vllm/collect_env.py +795 -0
- vllm/compilation/__init__.py +0 -0
- vllm/compilation/backends.py +715 -0
- vllm/compilation/compiler_interface.py +437 -0
- vllm/compilation/counter.py +33 -0
- vllm/compilation/decorators.py +249 -0
- vllm/compilation/fix_functionalization.py +182 -0
- vllm/compilation/fusion.py +617 -0
- vllm/compilation/fx_utils.py +60 -0
- vllm/compilation/inductor_pass.py +114 -0
- vllm/compilation/monitor.py +38 -0
- vllm/compilation/multi_output_match.py +108 -0
- vllm/compilation/noop_elimination.py +135 -0
- vllm/compilation/pass_manager.py +74 -0
- vllm/compilation/sequence_parallelism.py +266 -0
- vllm/compilation/torch25_custom_graph_pass.py +41 -0
- vllm/compilation/vllm_inductor_pass.py +68 -0
- vllm/compilation/wrapper.py +129 -0
- vllm/config.py +4179 -0
- vllm/connections.py +170 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +398 -0
- vllm/core/block/common.py +370 -0
- vllm/core/block/cpu_gpu_block_allocator.py +440 -0
- vllm/core/block/interfaces.py +318 -0
- vllm/core/block/naive_block.py +465 -0
- vllm/core/block/prefix_caching_block.py +1134 -0
- vllm/core/block/utils.py +27 -0
- vllm/core/block_manager.py +520 -0
- vllm/core/evictor.py +156 -0
- vllm/core/interfaces.py +134 -0
- vllm/core/placeholder_block_space_manager.py +99 -0
- vllm/core/scheduler.py +2060 -0
- vllm/device_allocator/__init__.py +0 -0
- vllm/device_allocator/cumem.py +280 -0
- vllm/distributed/__init__.py +5 -0
- vllm/distributed/communication_op.py +40 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/base_device_communicator.py +151 -0
- vllm/distributed/device_communicators/cpu_communicator.py +139 -0
- vllm/distributed/device_communicators/cuda_communicator.py +131 -0
- vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +301 -0
- vllm/distributed/device_communicators/custom_all_reduce_utils.py +257 -0
- vllm/distributed/device_communicators/hpu_communicator.py +45 -0
- vllm/distributed/device_communicators/neuron_communicator.py +19 -0
- vllm/distributed/device_communicators/pynccl.py +217 -0
- vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
- vllm/distributed/device_communicators/shm_broadcast.py +557 -0
- vllm/distributed/device_communicators/tpu_communicator.py +93 -0
- vllm/distributed/device_communicators/xpu_communicator.py +54 -0
- vllm/distributed/kv_transfer/README.md +29 -0
- vllm/distributed/kv_transfer/__init__.py +11 -0
- vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
- vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
- vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
- vllm/distributed/kv_transfer/kv_connector/factory.py +107 -0
- vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
- vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +201 -0
- vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
- vllm/distributed/kv_transfer/kv_connector/utils.py +90 -0
- vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +8 -0
- vllm/distributed/kv_transfer/kv_connector/v1/base.py +209 -0
- vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +131 -0
- vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
- vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
- vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
- vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
- vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
- vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
- vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
- vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
- vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
- vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
- vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
- vllm/distributed/parallel_state.py +1209 -0
- vllm/distributed/utils.py +366 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +1724 -0
- vllm/engine/async_llm_engine.py +1261 -0
- vllm/engine/async_timeout.py +191 -0
- vllm/engine/llm_engine.py +2150 -0
- vllm/engine/metrics.py +717 -0
- vllm/engine/metrics_types.py +96 -0
- vllm/engine/multiprocessing/__init__.py +183 -0
- vllm/engine/multiprocessing/client.py +745 -0
- vllm/engine/multiprocessing/engine.py +450 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +74 -0
- vllm/engine/output_processor/multi_step.py +210 -0
- vllm/engine/output_processor/single_step.py +136 -0
- vllm/engine/output_processor/stop_checker.py +130 -0
- vllm/engine/output_processor/util.py +27 -0
- vllm/engine/protocol.py +302 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +177 -0
- vllm/entrypoints/chat_utils.py +1259 -0
- vllm/entrypoints/cli/__init__.py +0 -0
- vllm/entrypoints/cli/benchmark/__init__.py +0 -0
- vllm/entrypoints/cli/benchmark/base.py +38 -0
- vllm/entrypoints/cli/benchmark/latency.py +29 -0
- vllm/entrypoints/cli/benchmark/main.py +53 -0
- vllm/entrypoints/cli/benchmark/serve.py +29 -0
- vllm/entrypoints/cli/benchmark/throughput.py +29 -0
- vllm/entrypoints/cli/collect_env.py +35 -0
- vllm/entrypoints/cli/main.py +59 -0
- vllm/entrypoints/cli/openai.py +175 -0
- vllm/entrypoints/cli/serve.py +59 -0
- vllm/entrypoints/cli/types.py +24 -0
- vllm/entrypoints/launcher.py +146 -0
- vllm/entrypoints/llm.py +1450 -0
- vllm/entrypoints/logger.py +44 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +1130 -0
- vllm/entrypoints/openai/cli_args.py +296 -0
- vllm/entrypoints/openai/logits_processors.py +89 -0
- vllm/entrypoints/openai/protocol.py +1806 -0
- vllm/entrypoints/openai/run_batch.py +439 -0
- vllm/entrypoints/openai/serving_chat.py +1210 -0
- vllm/entrypoints/openai/serving_completion.py +557 -0
- vllm/entrypoints/openai/serving_embedding.py +245 -0
- vllm/entrypoints/openai/serving_engine.py +569 -0
- vllm/entrypoints/openai/serving_models.py +314 -0
- vllm/entrypoints/openai/serving_pooling.py +237 -0
- vllm/entrypoints/openai/serving_score.py +439 -0
- vllm/entrypoints/openai/serving_tokenization.py +147 -0
- vllm/entrypoints/openai/serving_transcription.py +421 -0
- vllm/entrypoints/openai/tool_parsers/__init__.py +19 -0
- vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
- vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +254 -0
- vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +232 -0
- vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
- vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +211 -0
- vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +303 -0
- vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +262 -0
- vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
- vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +110 -0
- vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +292 -0
- vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
- vllm/entrypoints/score_utils.py +49 -0
- vllm/entrypoints/ssl.py +74 -0
- vllm/entrypoints/utils.py +136 -0
- vllm/env_override.py +34 -0
- vllm/envs.py +800 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/executor_base.py +400 -0
- vllm/executor/mp_distributed_executor.py +243 -0
- vllm/executor/msgspec_utils.py +29 -0
- vllm/executor/multiproc_worker_utils.py +312 -0
- vllm/executor/ray_distributed_executor.py +700 -0
- vllm/executor/ray_utils.py +400 -0
- vllm/executor/uniproc_executor.py +141 -0
- vllm/forward_context.py +159 -0
- vllm/inputs/__init__.py +37 -0
- vllm/inputs/data.py +248 -0
- vllm/inputs/parse.py +121 -0
- vllm/inputs/preprocess.py +745 -0
- vllm/inputs/registry.py +212 -0
- vllm/jsontree.py +79 -0
- vllm/logger.py +210 -0
- vllm/logging_utils/__init__.py +7 -0
- vllm/logging_utils/formatter.py +17 -0
- vllm/logits_process.py +121 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +335 -0
- vllm/lora/layers.py +1263 -0
- vllm/lora/lora.py +198 -0
- vllm/lora/models.py +802 -0
- vllm/lora/ops/__init__.py +0 -0
- vllm/lora/ops/torch_ops/__init__.py +15 -0
- vllm/lora/ops/torch_ops/lora_ops.py +115 -0
- vllm/lora/ops/triton_ops/__init__.py +11 -0
- vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
- vllm/lora/ops/triton_ops/lora_expand.py +293 -0
- vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
- vllm/lora/ops/triton_ops/lora_shrink.py +247 -0
- vllm/lora/ops/triton_ops/utils.py +121 -0
- vllm/lora/peft_helper.py +115 -0
- vllm/lora/punica_wrapper/__init__.py +9 -0
- vllm/lora/punica_wrapper/punica_base.py +483 -0
- vllm/lora/punica_wrapper/punica_cpu.py +348 -0
- vllm/lora/punica_wrapper/punica_gpu.py +289 -0
- vllm/lora/punica_wrapper/punica_hpu.py +144 -0
- vllm/lora/punica_wrapper/punica_selector.py +20 -0
- vllm/lora/punica_wrapper/utils.py +161 -0
- vllm/lora/request.py +97 -0
- vllm/lora/resolver.py +83 -0
- vllm/lora/utils.py +237 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +15 -0
- vllm/model_executor/custom_op.py +153 -0
- vllm/model_executor/guided_decoding/__init__.py +180 -0
- vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
- vllm/model_executor/guided_decoding/guidance_logits_processors.py +85 -0
- vllm/model_executor/guided_decoding/guided_fields.py +42 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +271 -0
- vllm/model_executor/guided_decoding/reasoner/__init__.py +35 -0
- vllm/model_executor/guided_decoding/utils.py +241 -0
- vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +368 -0
- vllm/model_executor/layers/fused_moe/__init__.py +51 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/README +12 -0
- vllm/model_executor/layers/fused_moe/cutlass_moe.py +180 -0
- vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +294 -0
- vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +374 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +1539 -0
- vllm/model_executor/layers/fused_moe/layer.py +949 -0
- vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
- vllm/model_executor/layers/fused_moe/moe_pallas.py +64 -0
- vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
- vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +416 -0
- vllm/model_executor/layers/fused_moe/utils.py +48 -0
- vllm/model_executor/layers/layernorm.py +277 -0
- vllm/model_executor/layers/lightning_attn.py +651 -0
- vllm/model_executor/layers/linear.py +1518 -0
- vllm/model_executor/layers/logits_processor.py +196 -0
- vllm/model_executor/layers/mamba/__init__.py +0 -0
- vllm/model_executor/layers/mamba/mamba2_metadata.py +109 -0
- vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
- vllm/model_executor/layers/mamba/mamba_mixer2.py +538 -0
- vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
- vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
- vllm/model_executor/layers/mamba/ops/mamba_ssm.py +415 -0
- vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
- vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
- vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
- vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
- vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
- vllm/model_executor/layers/pooler.py +336 -0
- vllm/model_executor/layers/quantization/__init__.py +153 -0
- vllm/model_executor/layers/quantization/aqlm.py +374 -0
- vllm/model_executor/layers/quantization/awq.py +184 -0
- vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
- vllm/model_executor/layers/quantization/awq_triton.py +319 -0
- vllm/model_executor/layers/quantization/base_config.py +145 -0
- vllm/model_executor/layers/quantization/bitblas.py +459 -0
- vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
- vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
- vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +624 -0
- vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1100 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +20 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +119 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
- vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
- vllm/model_executor/layers/quantization/compressed_tensors/utils.py +213 -0
- vllm/model_executor/layers/quantization/deepspeedfp.py +193 -0
- vllm/model_executor/layers/quantization/experts_int8.py +194 -0
- vllm/model_executor/layers/quantization/fbgemm_fp8.py +168 -0
- vllm/model_executor/layers/quantization/fp8.py +832 -0
- vllm/model_executor/layers/quantization/gguf.py +408 -0
- vllm/model_executor/layers/quantization/gptq.py +276 -0
- vllm/model_executor/layers/quantization/gptq_bitblas.py +438 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +643 -0
- vllm/model_executor/layers/quantization/gptq_marlin_24.py +295 -0
- vllm/model_executor/layers/quantization/hqq_marlin.py +328 -0
- vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
- vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +132 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
- vllm/model_executor/layers/quantization/kv_cache.py +137 -0
- vllm/model_executor/layers/quantization/marlin.py +259 -0
- vllm/model_executor/layers/quantization/modelopt.py +410 -0
- vllm/model_executor/layers/quantization/moe_wna16.py +447 -0
- vllm/model_executor/layers/quantization/neuron_quant.py +67 -0
- vllm/model_executor/layers/quantization/ptpc_fp8.py +125 -0
- vllm/model_executor/layers/quantization/qqq.py +273 -0
- vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
- vllm/model_executor/layers/quantization/quark/quark.py +385 -0
- vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
- vllm/model_executor/layers/quantization/quark/schemes/__init__.py +7 -0
- vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
- vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +142 -0
- vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
- vllm/model_executor/layers/quantization/quark/utils.py +102 -0
- vllm/model_executor/layers/quantization/schema.py +85 -0
- vllm/model_executor/layers/quantization/torchao.py +127 -0
- vllm/model_executor/layers/quantization/tpu_int8.py +119 -0
- vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
- vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
- vllm/model_executor/layers/quantization/utils/bitblas_utils.py +198 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/fp8_utils.py +523 -0
- vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
- vllm/model_executor/layers/quantization/utils/int8_utils.py +459 -0
- vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
- vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils.py +413 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +110 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +127 -0
- vllm/model_executor/layers/quantization/utils/quant_utils.py +571 -0
- vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
- vllm/model_executor/layers/rejection_sampler.py +400 -0
- vllm/model_executor/layers/resampler.py +269 -0
- vllm/model_executor/layers/rotary_embedding.py +1598 -0
- vllm/model_executor/layers/sampler.py +1221 -0
- vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
- vllm/model_executor/layers/typical_acceptance_sampler.py +172 -0
- vllm/model_executor/layers/utils.py +99 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +485 -0
- vllm/model_executor/model_loader/__init__.py +20 -0
- vllm/model_executor/model_loader/loader.py +1542 -0
- vllm/model_executor/model_loader/neuron.py +243 -0
- vllm/model_executor/model_loader/tensorizer.py +468 -0
- vllm/model_executor/model_loader/utils.py +171 -0
- vllm/model_executor/model_loader/weight_utils.py +749 -0
- vllm/model_executor/models/__init__.py +27 -0
- vllm/model_executor/models/adapters.py +247 -0
- vllm/model_executor/models/arctic.py +559 -0
- vllm/model_executor/models/aria.py +656 -0
- vllm/model_executor/models/aya_vision.py +461 -0
- vllm/model_executor/models/baichuan.py +469 -0
- vllm/model_executor/models/bamba.py +542 -0
- vllm/model_executor/models/bart.py +936 -0
- vllm/model_executor/models/bert.py +725 -0
- vllm/model_executor/models/blip.py +337 -0
- vllm/model_executor/models/blip2.py +717 -0
- vllm/model_executor/models/bloom.py +358 -0
- vllm/model_executor/models/chameleon.py +1135 -0
- vllm/model_executor/models/chatglm.py +476 -0
- vllm/model_executor/models/clip.py +410 -0
- vllm/model_executor/models/commandr.py +466 -0
- vllm/model_executor/models/constant_size_cache.py +136 -0
- vllm/model_executor/models/dbrx.py +469 -0
- vllm/model_executor/models/deepseek.py +484 -0
- vllm/model_executor/models/deepseek_mtp.py +266 -0
- vllm/model_executor/models/deepseek_v2.py +830 -0
- vllm/model_executor/models/deepseek_vl2.py +647 -0
- vllm/model_executor/models/eagle.py +247 -0
- vllm/model_executor/models/exaone.py +548 -0
- vllm/model_executor/models/fairseq2_llama.py +153 -0
- vllm/model_executor/models/falcon.py +508 -0
- vllm/model_executor/models/florence2.py +1102 -0
- vllm/model_executor/models/fuyu.py +388 -0
- vllm/model_executor/models/gemma.py +423 -0
- vllm/model_executor/models/gemma2.py +423 -0
- vllm/model_executor/models/gemma3.py +531 -0
- vllm/model_executor/models/gemma3_mm.py +716 -0
- vllm/model_executor/models/glm.py +22 -0
- vllm/model_executor/models/glm4.py +303 -0
- vllm/model_executor/models/glm4v.py +647 -0
- vllm/model_executor/models/gpt2.py +313 -0
- vllm/model_executor/models/gpt_bigcode.py +336 -0
- vllm/model_executor/models/gpt_j.py +337 -0
- vllm/model_executor/models/gpt_neox.py +330 -0
- vllm/model_executor/models/granite.py +494 -0
- vllm/model_executor/models/granite_speech.py +777 -0
- vllm/model_executor/models/granitemoe.py +435 -0
- vllm/model_executor/models/granitemoeshared.py +339 -0
- vllm/model_executor/models/gritlm.py +245 -0
- vllm/model_executor/models/grok1.py +560 -0
- vllm/model_executor/models/h2ovl.py +542 -0
- vllm/model_executor/models/idefics2_vision_model.py +387 -0
- vllm/model_executor/models/idefics3.py +767 -0
- vllm/model_executor/models/interfaces.py +569 -0
- vllm/model_executor/models/interfaces_base.py +163 -0
- vllm/model_executor/models/intern_vit.py +476 -0
- vllm/model_executor/models/internlm2.py +453 -0
- vllm/model_executor/models/internlm2_ve.py +146 -0
- vllm/model_executor/models/internvl.py +945 -0
- vllm/model_executor/models/jais.py +371 -0
- vllm/model_executor/models/jamba.py +590 -0
- vllm/model_executor/models/kimi_vl.py +577 -0
- vllm/model_executor/models/llama.py +619 -0
- vllm/model_executor/models/llama4.py +530 -0
- vllm/model_executor/models/llama_eagle.py +152 -0
- vllm/model_executor/models/llama_eagle3.py +232 -0
- vllm/model_executor/models/llava.py +869 -0
- vllm/model_executor/models/llava_next.py +582 -0
- vllm/model_executor/models/llava_next_video.py +470 -0
- vllm/model_executor/models/llava_onevision.py +954 -0
- vllm/model_executor/models/mamba.py +271 -0
- vllm/model_executor/models/mamba2.py +302 -0
- vllm/model_executor/models/mamba_cache.py +76 -0
- vllm/model_executor/models/medusa.py +210 -0
- vllm/model_executor/models/minicpm.py +592 -0
- vllm/model_executor/models/minicpm3.py +229 -0
- vllm/model_executor/models/minicpmo.py +725 -0
- vllm/model_executor/models/minicpmv.py +1287 -0
- vllm/model_executor/models/minimax_cache.py +35 -0
- vllm/model_executor/models/minimax_text_01.py +1261 -0
- vllm/model_executor/models/mistral3.py +598 -0
- vllm/model_executor/models/mixtral.py +485 -0
- vllm/model_executor/models/mixtral_quant.py +447 -0
- vllm/model_executor/models/mllama.py +1623 -0
- vllm/model_executor/models/mllama4.py +838 -0
- vllm/model_executor/models/mlp_speculator.py +205 -0
- vllm/model_executor/models/modernbert.py +325 -0
- vllm/model_executor/models/module_mapping.py +71 -0
- vllm/model_executor/models/molmo.py +1567 -0
- vllm/model_executor/models/moonvit.py +628 -0
- vllm/model_executor/models/mpt.py +329 -0
- vllm/model_executor/models/nemotron.py +506 -0
- vllm/model_executor/models/nemotron_nas.py +446 -0
- vllm/model_executor/models/nvlm_d.py +212 -0
- vllm/model_executor/models/olmo.py +390 -0
- vllm/model_executor/models/olmo2.py +412 -0
- vllm/model_executor/models/olmoe.py +449 -0
- vllm/model_executor/models/opt.py +410 -0
- vllm/model_executor/models/orion.py +356 -0
- vllm/model_executor/models/paligemma.py +397 -0
- vllm/model_executor/models/persimmon.py +342 -0
- vllm/model_executor/models/phi.py +354 -0
- vllm/model_executor/models/phi3.py +18 -0
- vllm/model_executor/models/phi3_small.py +463 -0
- vllm/model_executor/models/phi3v.py +722 -0
- vllm/model_executor/models/phi4mm.py +1263 -0
- vllm/model_executor/models/phi4mm_audio.py +1232 -0
- vllm/model_executor/models/phi4mm_utils.py +1883 -0
- vllm/model_executor/models/phimoe.py +666 -0
- vllm/model_executor/models/pixtral.py +1281 -0
- vllm/model_executor/models/plamo2.py +736 -0
- vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
- vllm/model_executor/models/qwen.py +360 -0
- vllm/model_executor/models/qwen2.py +552 -0
- vllm/model_executor/models/qwen2_5_omni_thinker.py +901 -0
- vllm/model_executor/models/qwen2_5_vl.py +1136 -0
- vllm/model_executor/models/qwen2_audio.py +402 -0
- vllm/model_executor/models/qwen2_moe.py +531 -0
- vllm/model_executor/models/qwen2_rm.py +130 -0
- vllm/model_executor/models/qwen2_vl.py +1409 -0
- vllm/model_executor/models/qwen3.py +319 -0
- vllm/model_executor/models/qwen3_moe.py +528 -0
- vllm/model_executor/models/qwen_vl.py +784 -0
- vllm/model_executor/models/registry.py +611 -0
- vllm/model_executor/models/roberta.py +332 -0
- vllm/model_executor/models/siglip.py +522 -0
- vllm/model_executor/models/skyworkr1v.py +949 -0
- vllm/model_executor/models/smolvlm.py +51 -0
- vllm/model_executor/models/solar.py +504 -0
- vllm/model_executor/models/stablelm.py +349 -0
- vllm/model_executor/models/starcoder2.py +355 -0
- vllm/model_executor/models/telechat2.py +139 -0
- vllm/model_executor/models/teleflm.py +78 -0
- vllm/model_executor/models/transformers.py +442 -0
- vllm/model_executor/models/ultravox.py +655 -0
- vllm/model_executor/models/utils.py +714 -0
- vllm/model_executor/models/vision.py +149 -0
- vllm/model_executor/models/whisper.py +746 -0
- vllm/model_executor/models/zamba2.py +1008 -0
- vllm/model_executor/parameter.py +458 -0
- vllm/model_executor/pooling_metadata.py +71 -0
- vllm/model_executor/sampling_metadata.py +596 -0
- vllm/model_executor/utils.py +53 -0
- vllm/multimodal/__init__.py +31 -0
- vllm/multimodal/audio.py +105 -0
- vllm/multimodal/base.py +218 -0
- vllm/multimodal/hasher.py +103 -0
- vllm/multimodal/image.py +77 -0
- vllm/multimodal/inputs.py +843 -0
- vllm/multimodal/parse.py +454 -0
- vllm/multimodal/processing.py +1760 -0
- vllm/multimodal/profiling.py +274 -0
- vllm/multimodal/registry.py +321 -0
- vllm/multimodal/utils.py +386 -0
- vllm/multimodal/video.py +166 -0
- vllm/outputs.py +521 -0
- vllm/platforms/__init__.py +286 -0
- vllm/platforms/cpu.py +182 -0
- vllm/platforms/cuda.py +463 -0
- vllm/platforms/hpu.py +94 -0
- vllm/platforms/interface.py +427 -0
- vllm/platforms/neuron.py +69 -0
- vllm/platforms/rocm.py +346 -0
- vllm/platforms/tpu.py +174 -0
- vllm/platforms/xpu.py +142 -0
- vllm/plugins/__init__.py +82 -0
- vllm/pooling_params.py +53 -0
- vllm/profiler/__init__.py +7 -0
- vllm/profiler/layerwise_profile.py +374 -0
- vllm/profiler/utils.py +147 -0
- vllm/prompt_adapter/__init__.py +0 -0
- vllm/prompt_adapter/layers.py +82 -0
- vllm/prompt_adapter/models.py +357 -0
- vllm/prompt_adapter/request.py +36 -0
- vllm/prompt_adapter/utils.py +97 -0
- vllm/prompt_adapter/worker_manager.py +178 -0
- vllm/py.typed +2 -0
- vllm/reasoning/__init__.py +12 -0
- vllm/reasoning/abs_reasoning_parsers.py +189 -0
- vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
- vllm/reasoning/granite_reasoning_parser.py +362 -0
- vllm/sampling_params.py +598 -0
- vllm/scalar_type.py +335 -0
- vllm/scripts.py +14 -0
- vllm/sequence.py +1486 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +505 -0
- vllm/spec_decode/draft_model_runner.py +335 -0
- vllm/spec_decode/interfaces.py +98 -0
- vllm/spec_decode/medusa_worker.py +137 -0
- vllm/spec_decode/metrics.py +212 -0
- vllm/spec_decode/mlp_speculator_worker.py +93 -0
- vllm/spec_decode/mqa_scorer.py +159 -0
- vllm/spec_decode/multi_step_worker.py +416 -0
- vllm/spec_decode/ngram_worker.py +195 -0
- vllm/spec_decode/proposer_worker_base.py +58 -0
- vllm/spec_decode/smaller_tp_proposer_worker.py +194 -0
- vllm/spec_decode/spec_decode_worker.py +1324 -0
- vllm/spec_decode/target_model_runner.py +44 -0
- vllm/spec_decode/top1_proposer.py +274 -0
- vllm/spec_decode/util.py +276 -0
- vllm/test_utils.py +129 -0
- vllm/third_party/__init__.py +0 -0
- vllm/third_party/pynvml.py +6139 -0
- vllm/tracing.py +130 -0
- vllm/transformers_utils/__init__.py +19 -0
- vllm/transformers_utils/config.py +813 -0
- vllm/transformers_utils/configs/__init__.py +52 -0
- vllm/transformers_utils/configs/arctic.py +206 -0
- vllm/transformers_utils/configs/chatglm.py +71 -0
- vllm/transformers_utils/configs/cohere2.py +194 -0
- vllm/transformers_utils/configs/dbrx.py +280 -0
- vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
- vllm/transformers_utils/configs/eagle.py +65 -0
- vllm/transformers_utils/configs/exaone.py +191 -0
- vllm/transformers_utils/configs/falcon.py +89 -0
- vllm/transformers_utils/configs/h2ovl.py +15 -0
- vllm/transformers_utils/configs/internvl.py +53 -0
- vllm/transformers_utils/configs/jais.py +237 -0
- vllm/transformers_utils/configs/kimi_vl.py +36 -0
- vllm/transformers_utils/configs/medusa.py +62 -0
- vllm/transformers_utils/configs/mllama.py +30 -0
- vllm/transformers_utils/configs/mlp_speculator.py +67 -0
- vllm/transformers_utils/configs/moonvit.py +32 -0
- vllm/transformers_utils/configs/mpt.py +179 -0
- vllm/transformers_utils/configs/nemotron.py +204 -0
- vllm/transformers_utils/configs/nvlm_d.py +14 -0
- vllm/transformers_utils/configs/skyworkr1v.py +53 -0
- vllm/transformers_utils/configs/solar.py +246 -0
- vllm/transformers_utils/configs/telechat2.py +63 -0
- vllm/transformers_utils/configs/ultravox.py +107 -0
- vllm/transformers_utils/detokenizer.py +167 -0
- vllm/transformers_utils/detokenizer_utils.py +188 -0
- vllm/transformers_utils/processor.py +210 -0
- vllm/transformers_utils/processors/__init__.py +6 -0
- vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
- vllm/transformers_utils/s3_utils.py +161 -0
- vllm/transformers_utils/tokenizer.py +291 -0
- vllm/transformers_utils/tokenizer_base.py +146 -0
- vllm/transformers_utils/tokenizer_group.py +110 -0
- vllm/transformers_utils/tokenizers/__init__.py +9 -0
- vllm/transformers_utils/tokenizers/mistral.py +483 -0
- vllm/transformers_utils/utils.py +98 -0
- vllm/triton_utils/__init__.py +5 -0
- vllm/triton_utils/importing.py +53 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +255 -0
- vllm/utils.py +2692 -0
- vllm/v1/__init__.py +0 -0
- vllm/v1/attention/__init__.py +0 -0
- vllm/v1/attention/backends/__init__.py +0 -0
- vllm/v1/attention/backends/flash_attn.py +783 -0
- vllm/v1/attention/backends/flashinfer.py +638 -0
- vllm/v1/attention/backends/mla/__init__.py +0 -0
- vllm/v1/attention/backends/mla/common.py +974 -0
- vllm/v1/attention/backends/mla/flashmla.py +149 -0
- vllm/v1/attention/backends/mla/triton_mla.py +118 -0
- vllm/v1/attention/backends/pallas.py +221 -0
- vllm/v1/attention/backends/triton_attn.py +198 -0
- vllm/v1/core/__init__.py +0 -0
- vllm/v1/core/block_pool.py +281 -0
- vllm/v1/core/encoder_cache_manager.py +149 -0
- vllm/v1/core/kv_cache_manager.py +385 -0
- vllm/v1/core/kv_cache_utils.py +744 -0
- vllm/v1/core/sched/__init__.py +0 -0
- vllm/v1/core/sched/interface.py +134 -0
- vllm/v1/core/sched/output.py +126 -0
- vllm/v1/core/sched/scheduler.py +838 -0
- vllm/v1/core/sched/utils.py +22 -0
- vllm/v1/core/specialized_manager.py +161 -0
- vllm/v1/engine/__init__.py +166 -0
- vllm/v1/engine/async_llm.py +532 -0
- vllm/v1/engine/core.py +701 -0
- vllm/v1/engine/core_client.py +942 -0
- vllm/v1/engine/detokenizer.py +260 -0
- vllm/v1/engine/exceptions.py +16 -0
- vllm/v1/engine/llm_engine.py +285 -0
- vllm/v1/engine/logprobs.py +198 -0
- vllm/v1/engine/mm_input_cache.py +82 -0
- vllm/v1/engine/output_processor.py +420 -0
- vllm/v1/engine/parallel_sampling.py +132 -0
- vllm/v1/engine/processor.py +387 -0
- vllm/v1/executor/__init__.py +0 -0
- vllm/v1/executor/abstract.py +112 -0
- vllm/v1/executor/multiproc_executor.py +480 -0
- vllm/v1/executor/ray_distributed_executor.py +61 -0
- vllm/v1/kv_cache_interface.py +166 -0
- vllm/v1/metrics/__init__.py +0 -0
- vllm/v1/metrics/loggers.py +498 -0
- vllm/v1/metrics/stats.py +238 -0
- vllm/v1/outputs.py +111 -0
- vllm/v1/request.py +178 -0
- vllm/v1/sample/__init__.py +0 -0
- vllm/v1/sample/metadata.py +43 -0
- vllm/v1/sample/ops/__init__.py +0 -0
- vllm/v1/sample/ops/bad_words.py +38 -0
- vllm/v1/sample/ops/penalties.py +58 -0
- vllm/v1/sample/ops/topk_topp_sampler.py +315 -0
- vllm/v1/sample/rejection_sampler.py +631 -0
- vllm/v1/sample/sampler.py +270 -0
- vllm/v1/sample/tpu/__init__.py +0 -0
- vllm/v1/sample/tpu/metadata.py +118 -0
- vllm/v1/sample/tpu/sampler.py +154 -0
- vllm/v1/serial_utils.py +274 -0
- vllm/v1/spec_decode/__init__.py +0 -0
- vllm/v1/spec_decode/eagle.py +318 -0
- vllm/v1/spec_decode/metadata.py +61 -0
- vllm/v1/spec_decode/metrics.py +164 -0
- vllm/v1/spec_decode/ngram_proposer.py +131 -0
- vllm/v1/spec_decode/utils.py +18 -0
- vllm/v1/stats/__init__.py +0 -0
- vllm/v1/stats/common.py +453 -0
- vllm/v1/structured_output/__init__.py +113 -0
- vllm/v1/structured_output/backend_guidance.py +215 -0
- vllm/v1/structured_output/backend_types.py +96 -0
- vllm/v1/structured_output/backend_xgrammar.py +299 -0
- vllm/v1/structured_output/request.py +84 -0
- vllm/v1/structured_output/utils.py +174 -0
- vllm/v1/utils.py +249 -0
- vllm/v1/worker/__init__.py +0 -0
- vllm/v1/worker/block_table.py +87 -0
- vllm/v1/worker/gpu_input_batch.py +677 -0
- vllm/v1/worker/gpu_model_runner.py +1776 -0
- vllm/v1/worker/gpu_worker.py +349 -0
- vllm/v1/worker/lora_model_runner_mixin.py +145 -0
- vllm/v1/worker/tpu_model_runner.py +1419 -0
- vllm/v1/worker/tpu_worker.py +260 -0
- vllm/v1/worker/utils.py +74 -0
- vllm/v1/worker/worker_base.py +64 -0
- vllm/version.py +40 -0
- vllm/vllm_flash_attn/.gitkeep +0 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +144 -0
- vllm/worker/cpu_enc_dec_model_runner.py +323 -0
- vllm/worker/cpu_model_runner.py +668 -0
- vllm/worker/cpu_pooling_model_runner.py +122 -0
- vllm/worker/cpu_worker.py +400 -0
- vllm/worker/enc_dec_model_runner.py +542 -0
- vllm/worker/hpu_model_runner.py +2221 -0
- vllm/worker/hpu_worker.py +483 -0
- vllm/worker/model_runner.py +2056 -0
- vllm/worker/model_runner_base.py +281 -0
- vllm/worker/multi_step_hpu_worker.py +122 -0
- vllm/worker/multi_step_model_runner.py +908 -0
- vllm/worker/multi_step_tpu_worker.py +107 -0
- vllm/worker/multi_step_worker.py +196 -0
- vllm/worker/neuron_model_runner.py +336 -0
- vllm/worker/neuron_worker.py +138 -0
- vllm/worker/pooling_model_runner.py +200 -0
- vllm/worker/tpu_model_runner.py +908 -0
- vllm/worker/tpu_worker.py +332 -0
- vllm/worker/utils.py +52 -0
- vllm/worker/worker.py +570 -0
- vllm/worker/worker_base.py +644 -0
- vllm/worker/xpu_model_runner.py +603 -0
- vllm/worker/xpu_worker.py +185 -0
- vllm_cpu-0.8.5.post2.dist-info/METADATA +309 -0
- vllm_cpu-0.8.5.post2.dist-info/RECORD +1103 -0
- vllm_cpu-0.8.5.post2.dist-info/WHEEL +5 -0
- vllm_cpu-0.8.5.post2.dist-info/entry_points.txt +2 -0
- vllm_cpu-0.8.5.post2.dist-info/top_level.txt +1 -0
vllm/core/scheduler.py
ADDED
@@ -0,0 +1,2060 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
|
|
3
|
+
import enum
|
|
4
|
+
import os
|
|
5
|
+
import random
|
|
6
|
+
import time
|
|
7
|
+
from collections import deque
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Callable, Deque, Dict, Iterable, List, Optional
|
|
10
|
+
from typing import Sequence as GenericSequence
|
|
11
|
+
from typing import Set, Tuple, Union
|
|
12
|
+
|
|
13
|
+
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
|
14
|
+
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
|
|
15
|
+
from vllm.logger import init_logger
|
|
16
|
+
from vllm.lora.request import LoRARequest
|
|
17
|
+
from vllm.prompt_adapter.request import PromptAdapterRequest
|
|
18
|
+
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
|
|
19
|
+
SequenceGroupBase, SequenceGroupMetadata,
|
|
20
|
+
SequenceGroupMetadataDelta, SequenceStage,
|
|
21
|
+
SequenceStatus)
|
|
22
|
+
from vllm.utils import Device, PyObjectCache
|
|
23
|
+
|
|
24
|
+
logger = init_logger(__name__)
|
|
25
|
+
|
|
26
|
+
# Test-only. If configured, decode is preempted with
|
|
27
|
+
# ARTIFICIAL_PREEMPTION_PROB% probability.
|
|
28
|
+
ENABLE_ARTIFICIAL_PREEMPT = bool(
|
|
29
|
+
os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa
|
|
30
|
+
ARTIFICIAL_PREEMPTION_PROB = 0.5
|
|
31
|
+
ARTIFICIAL_PREEMPTION_MAX_CNT = 500
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PreemptionMode(enum.Enum):
|
|
35
|
+
"""Preemption modes.
|
|
36
|
+
|
|
37
|
+
1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
|
|
38
|
+
and swap them back in when the sequences are resumed.
|
|
39
|
+
2. Recomputation: Discard the blocks of the preempted sequences and
|
|
40
|
+
recompute them when the sequences are resumed, treating the sequences as
|
|
41
|
+
new prompts.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
SWAP = enum.auto()
|
|
45
|
+
RECOMPUTE = enum.auto()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
|
|
49
|
+
class SchedulingBudget:
|
|
50
|
+
"""The available slots for scheduling.
|
|
51
|
+
|
|
52
|
+
TODO(sang): Right now, the budget is request_id-aware meaning it can ignore
|
|
53
|
+
budget update from the same request_id. It is because in normal scheduling
|
|
54
|
+
path, we update RUNNING num_seqs ahead of time, meaning it could be
|
|
55
|
+
updated more than once when scheduling RUNNING requests. Since this won't
|
|
56
|
+
happen if we only have chunked prefill scheduling, we can remove this
|
|
57
|
+
feature from the API when chunked prefill is enabled by default.
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
token_budget: int
|
|
61
|
+
max_num_seqs: int
|
|
62
|
+
_request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
|
|
63
|
+
_request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
|
|
64
|
+
# Number of cached tokens in the batch.
|
|
65
|
+
_num_cached_tokens: int = 0
|
|
66
|
+
# Number of actual non-cached tokens in the batch.
|
|
67
|
+
_num_batched_tokens: int = 0
|
|
68
|
+
_num_curr_seqs: int = 0
|
|
69
|
+
|
|
70
|
+
def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
|
|
71
|
+
# We allow num_new_tokens to be 0 when the entire sequence has
|
|
72
|
+
# been cached.
|
|
73
|
+
assert num_new_tokens >= 0
|
|
74
|
+
assert num_new_seqs != 0
|
|
75
|
+
return (self.num_batched_tokens + num_new_tokens <= self.token_budget
|
|
76
|
+
and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)
|
|
77
|
+
|
|
78
|
+
def remaining_token_budget(self):
|
|
79
|
+
return self.token_budget - self.num_batched_tokens
|
|
80
|
+
|
|
81
|
+
def add_num_batched_tokens(self,
|
|
82
|
+
req_id: str,
|
|
83
|
+
num_batched_tokens: int,
|
|
84
|
+
num_cached_tokens: int = 0):
|
|
85
|
+
if req_id in self._request_ids_num_batched_tokens:
|
|
86
|
+
return
|
|
87
|
+
assert num_cached_tokens >= 0
|
|
88
|
+
assert num_batched_tokens >= 0
|
|
89
|
+
|
|
90
|
+
self._request_ids_num_batched_tokens.add(req_id)
|
|
91
|
+
self._num_batched_tokens += num_batched_tokens
|
|
92
|
+
self._num_cached_tokens += num_cached_tokens
|
|
93
|
+
|
|
94
|
+
def subtract_num_batched_tokens(self, req_id: str,
|
|
95
|
+
num_batched_tokens: int):
|
|
96
|
+
if req_id in self._request_ids_num_batched_tokens:
|
|
97
|
+
self._request_ids_num_batched_tokens.remove(req_id)
|
|
98
|
+
self._num_batched_tokens -= num_batched_tokens
|
|
99
|
+
|
|
100
|
+
def add_num_seqs(self, req_id: str, num_curr_seqs: int):
|
|
101
|
+
if req_id in self._request_ids_num_curr_seqs:
|
|
102
|
+
return
|
|
103
|
+
|
|
104
|
+
self._request_ids_num_curr_seqs.add(req_id)
|
|
105
|
+
self._num_curr_seqs += num_curr_seqs
|
|
106
|
+
|
|
107
|
+
def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
|
|
108
|
+
if req_id in self._request_ids_num_curr_seqs:
|
|
109
|
+
self._request_ids_num_curr_seqs.remove(req_id)
|
|
110
|
+
self._num_curr_seqs -= num_curr_seqs
|
|
111
|
+
|
|
112
|
+
@property
|
|
113
|
+
def num_batched_tokens(self):
|
|
114
|
+
return self._num_batched_tokens
|
|
115
|
+
|
|
116
|
+
@property
|
|
117
|
+
def num_curr_seqs(self):
|
|
118
|
+
return self._num_curr_seqs
|
|
119
|
+
|
|
120
|
+
@property
|
|
121
|
+
def num_cached_tokens(self):
|
|
122
|
+
return self._num_cached_tokens
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@dataclass
|
|
126
|
+
class ScheduledSequenceGroup:
|
|
127
|
+
# A sequence group that's scheduled.
|
|
128
|
+
seq_group: SequenceGroup
|
|
129
|
+
# The total chunk size (number of tokens) to process for next iteration.
|
|
130
|
+
# 1 for decoding. Same as prompt tokens for prefill, but if prefill is
|
|
131
|
+
# chunked, it can be smaller than that.
|
|
132
|
+
token_chunk_size: int
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass
|
|
136
|
+
class SchedulerOutputs:
|
|
137
|
+
"""The scheduling decision made from a scheduler."""
|
|
138
|
+
|
|
139
|
+
# Scheduled sequence groups.
|
|
140
|
+
scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup]
|
|
141
|
+
# Number of prefill groups scheduled.
|
|
142
|
+
num_prefill_groups: int
|
|
143
|
+
# Total number of batched tokens.
|
|
144
|
+
num_batched_tokens: int
|
|
145
|
+
# Blocks to swap in. List of CPU -> GPU block number.
|
|
146
|
+
blocks_to_swap_in: List[Tuple[int, int]]
|
|
147
|
+
# Blocks to swap out. List of GPU -> CPU block number.
|
|
148
|
+
blocks_to_swap_out: List[Tuple[int, int]]
|
|
149
|
+
# Blocks to copy. Source to dest block.
|
|
150
|
+
blocks_to_copy: List[Tuple[int, int]]
|
|
151
|
+
# Sequence groups that are going to be ignored.
|
|
152
|
+
ignored_seq_groups: List[SequenceGroup]
|
|
153
|
+
# The number of slots for lookahead decoding.
|
|
154
|
+
num_lookahead_slots: int
|
|
155
|
+
# The number of requests in the running queue
|
|
156
|
+
running_queue_size: int
|
|
157
|
+
preempted: int
|
|
158
|
+
|
|
159
|
+
def __post_init__(self):
|
|
160
|
+
# Swap in and swap out should never happen at the same time.
|
|
161
|
+
assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)
|
|
162
|
+
|
|
163
|
+
self.num_loras: int = len(self.lora_requests)
|
|
164
|
+
if self.num_loras > 0:
|
|
165
|
+
self._sort_by_lora_ids()
|
|
166
|
+
|
|
167
|
+
self.num_prompt_adapters: int = len(self.prompt_adapter_requests)
|
|
168
|
+
|
|
169
|
+
def is_empty(self) -> bool:
|
|
170
|
+
# NOTE: We do not consider the ignored sequence groups.
|
|
171
|
+
return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
|
|
172
|
+
and not self.blocks_to_swap_out and not self.blocks_to_copy)
|
|
173
|
+
|
|
174
|
+
def _sort_by_lora_ids(self):
|
|
175
|
+
assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups)
|
|
176
|
+
|
|
177
|
+
def key_fn(group: ScheduledSequenceGroup):
|
|
178
|
+
key = (group.seq_group.lora_int_id, group.seq_group.request_id)
|
|
179
|
+
if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups):
|
|
180
|
+
# Sort sequence groups so that all prefills come before all
|
|
181
|
+
# decodes as required by chunked prefill.
|
|
182
|
+
return (not group.seq_group.is_prefill(), *key)
|
|
183
|
+
return key
|
|
184
|
+
|
|
185
|
+
self.scheduled_seq_groups = sorted(self.scheduled_seq_groups,
|
|
186
|
+
key=key_fn)
|
|
187
|
+
|
|
188
|
+
@property
|
|
189
|
+
def lora_requests(self) -> Set[LoRARequest]:
|
|
190
|
+
return {
|
|
191
|
+
g.seq_group.lora_request
|
|
192
|
+
for g in self.scheduled_seq_groups
|
|
193
|
+
if g.seq_group.lora_request is not None
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
@property
|
|
197
|
+
def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
|
|
198
|
+
return {
|
|
199
|
+
g.seq_group.prompt_adapter_request
|
|
200
|
+
for g in self.scheduled_seq_groups
|
|
201
|
+
if g.seq_group.prompt_adapter_request is not None
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@dataclass
|
|
206
|
+
class SchedulerRunningOutputs:
|
|
207
|
+
"""The requests that are scheduled from a running queue.
|
|
208
|
+
|
|
209
|
+
Could contain prefill (prefill that's chunked) or decodes. If there's not
|
|
210
|
+
enough memory, it can be preempted (for recompute) or swapped out.
|
|
211
|
+
"""
|
|
212
|
+
|
|
213
|
+
# Selected sequences that are running and in a decoding phase.
|
|
214
|
+
decode_seq_groups: List[ScheduledSequenceGroup]
|
|
215
|
+
# Selected sequences that are running and in a prefill phase.
|
|
216
|
+
# I.e., it means the prefill has been chunked.
|
|
217
|
+
prefill_seq_groups: List[ScheduledSequenceGroup]
|
|
218
|
+
# The preempted sequences.
|
|
219
|
+
preempted: List[SequenceGroup]
|
|
220
|
+
# Sequences that are swapped out.
|
|
221
|
+
swapped_out: List[SequenceGroup]
|
|
222
|
+
# The blocks to swap out.
|
|
223
|
+
blocks_to_swap_out: List[Tuple[int, int]]
|
|
224
|
+
# The blocks to copy.
|
|
225
|
+
blocks_to_copy: List[Tuple[int, int]]
|
|
226
|
+
# The number of slots for lookahead decoding.
|
|
227
|
+
num_lookahead_slots: int
|
|
228
|
+
|
|
229
|
+
# Optimization for fast-access to seq_group lists
|
|
230
|
+
decode_seq_groups_list: List[SequenceGroup]
|
|
231
|
+
prefill_seq_groups_list: List[SequenceGroup]
|
|
232
|
+
|
|
233
|
+
@classmethod
|
|
234
|
+
def create_empty(cls) -> "SchedulerRunningOutputs":
|
|
235
|
+
return SchedulerRunningOutputs(
|
|
236
|
+
decode_seq_groups=[],
|
|
237
|
+
prefill_seq_groups=[],
|
|
238
|
+
preempted=[],
|
|
239
|
+
swapped_out=[],
|
|
240
|
+
blocks_to_swap_out=[],
|
|
241
|
+
blocks_to_copy=[],
|
|
242
|
+
num_lookahead_slots=0,
|
|
243
|
+
decode_seq_groups_list=[],
|
|
244
|
+
prefill_seq_groups_list=[],
|
|
245
|
+
)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@dataclass
|
|
249
|
+
class SchedulerSwappedInOutputs:
|
|
250
|
+
"""The requests that are scheduled from a swap queue.
|
|
251
|
+
|
|
252
|
+
Could contain prefill (prefill that's chunked) or decodes.
|
|
253
|
+
"""
|
|
254
|
+
|
|
255
|
+
# Selected sequences that are going to be swapped in and is in a
|
|
256
|
+
# decoding phase.
|
|
257
|
+
decode_seq_groups: List[ScheduledSequenceGroup]
|
|
258
|
+
# Selected sequences that are going to be swapped in and in a prefill
|
|
259
|
+
# phase. I.e., it means the prefill has been chunked.
|
|
260
|
+
prefill_seq_groups: List[ScheduledSequenceGroup]
|
|
261
|
+
# The blocks to swap in.
|
|
262
|
+
blocks_to_swap_in: List[Tuple[int, int]]
|
|
263
|
+
# The blocks to copy.
|
|
264
|
+
blocks_to_copy: List[Tuple[int, int]]
|
|
265
|
+
# The number of slots for lookahead decoding.
|
|
266
|
+
num_lookahead_slots: int
|
|
267
|
+
# Infeasible sequence groups.
|
|
268
|
+
infeasible_seq_groups: List[SequenceGroup]
|
|
269
|
+
|
|
270
|
+
@classmethod
|
|
271
|
+
def create_empty(cls) -> "SchedulerSwappedInOutputs":
|
|
272
|
+
return SchedulerSwappedInOutputs(
|
|
273
|
+
decode_seq_groups=[],
|
|
274
|
+
prefill_seq_groups=[],
|
|
275
|
+
blocks_to_swap_in=[],
|
|
276
|
+
blocks_to_copy=[],
|
|
277
|
+
num_lookahead_slots=0,
|
|
278
|
+
infeasible_seq_groups=[],
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@dataclass
|
|
283
|
+
class SchedulerPrefillOutputs:
|
|
284
|
+
"""The requests that are scheduled from a waiting queue.
|
|
285
|
+
|
|
286
|
+
Could contain a fresh prefill requests or preempted requests that need
|
|
287
|
+
to be recomputed from scratch.
|
|
288
|
+
"""
|
|
289
|
+
|
|
290
|
+
# Selected sequences for prefill.
|
|
291
|
+
seq_groups: List[ScheduledSequenceGroup]
|
|
292
|
+
# Ignored sequence groups.
|
|
293
|
+
ignored_seq_groups: List[SequenceGroup]
|
|
294
|
+
num_lookahead_slots: int
|
|
295
|
+
|
|
296
|
+
@classmethod
|
|
297
|
+
def create_empty(cls) -> "SchedulerPrefillOutputs":
|
|
298
|
+
return SchedulerPrefillOutputs(
|
|
299
|
+
seq_groups=[],
|
|
300
|
+
ignored_seq_groups=[],
|
|
301
|
+
num_lookahead_slots=0,
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def seq_group_metadata_builder():
|
|
306
|
+
return SequenceGroupMetadata(request_id="",
|
|
307
|
+
is_prompt=False,
|
|
308
|
+
seq_data={},
|
|
309
|
+
sampling_params=None,
|
|
310
|
+
block_tables={})
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def scheduler_running_outputs_builder():
|
|
314
|
+
return SchedulerRunningOutputs(decode_seq_groups=[],
|
|
315
|
+
prefill_seq_groups=[],
|
|
316
|
+
preempted=[],
|
|
317
|
+
swapped_out=[],
|
|
318
|
+
blocks_to_swap_out=[],
|
|
319
|
+
blocks_to_copy=[],
|
|
320
|
+
num_lookahead_slots=0,
|
|
321
|
+
prefill_seq_groups_list=[],
|
|
322
|
+
decode_seq_groups_list=[])
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def scheduled_seq_group_builder():
|
|
326
|
+
return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
|
|
327
|
+
token_chunk_size=0)
|
|
328
|
+
# return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@dataclass
|
|
332
|
+
class PartialPrefillMetadata:
|
|
333
|
+
"""Holds information about the partial prefills that are currently running
|
|
334
|
+
during a single iteration of the Scheduler.
|
|
335
|
+
When chunked prefill is enabled, we allow a certain number of seqs to be
|
|
336
|
+
partially prefilled during each iteration. Having multiple partial prefills
|
|
337
|
+
in flight allows us to minimize TTFT and avoid decode starvation in cases
|
|
338
|
+
where a single sequence group with a very large prompt blocks the queue for
|
|
339
|
+
too many iterations.
|
|
340
|
+
The number of long prefill requests is limited so that smaller
|
|
341
|
+
requests may jump the queue in front of them and get to the decode
|
|
342
|
+
phase faster.
|
|
343
|
+
"""
|
|
344
|
+
|
|
345
|
+
# A minimum bound on the total number of prefills to be scheduled during
|
|
346
|
+
# this iteration
|
|
347
|
+
schedulable_prefills: int
|
|
348
|
+
|
|
349
|
+
# The number of long prefill requests currently running
|
|
350
|
+
long_prefills: int
|
|
351
|
+
|
|
352
|
+
scheduler_config: SchedulerConfig
|
|
353
|
+
|
|
354
|
+
def can_schedule(self, seq_group: SequenceGroup) -> bool:
|
|
355
|
+
"""When concurrent partial prefills are enabled,
|
|
356
|
+
we limit the number of long requests and only accept
|
|
357
|
+
shorter requests from the queue while running them
|
|
358
|
+
concurrently"""
|
|
359
|
+
return not (seq_group.first_seq.get_num_new_tokens()
|
|
360
|
+
> self.scheduler_config.long_prefill_token_threshold
|
|
361
|
+
and self.long_prefills
|
|
362
|
+
>= self.scheduler_config.max_long_partial_prefills
|
|
363
|
+
and self.scheduler_config.max_num_partial_prefills > 1)
|
|
364
|
+
|
|
365
|
+
def maybe_increment_partial_prefills(self,
|
|
366
|
+
seq_group: SequenceGroup) -> None:
|
|
367
|
+
# When a new prefill is scheduled, we need to know if it is a
|
|
368
|
+
# long request
|
|
369
|
+
if (seq_group.first_seq.get_num_new_tokens()
|
|
370
|
+
> self.scheduler_config.long_prefill_token_threshold):
|
|
371
|
+
self.long_prefills += 1
|
|
372
|
+
|
|
373
|
+
@classmethod
|
|
374
|
+
def from_queues(
|
|
375
|
+
cls,
|
|
376
|
+
running: Deque[SequenceGroup],
|
|
377
|
+
waiting: Deque[SequenceGroup],
|
|
378
|
+
scheduler_config: SchedulerConfig,
|
|
379
|
+
) -> "PartialPrefillMetadata":
|
|
380
|
+
"""Create a PartialPrefillMetadata object from the current state of
|
|
381
|
+
the scheduler's queues.
|
|
382
|
+
This accounts for the currently running prefill requests, and peeks into
|
|
383
|
+
the waiting queue to see if there are more prefills to potentially be
|
|
384
|
+
scheduled during this iteration."""
|
|
385
|
+
prefills = 0
|
|
386
|
+
long_prefills = 0
|
|
387
|
+
|
|
388
|
+
waiting_long_prefills = 0
|
|
389
|
+
|
|
390
|
+
for sg in running:
|
|
391
|
+
if sg.first_seq.data.stage == SequenceStage.PREFILL:
|
|
392
|
+
prefills += 1
|
|
393
|
+
if (sg.first_seq.get_num_new_tokens()
|
|
394
|
+
> scheduler_config.long_prefill_token_threshold):
|
|
395
|
+
long_prefills += 1
|
|
396
|
+
|
|
397
|
+
for sg in waiting:
|
|
398
|
+
# Don't bother looping through the rest of the queue if we know
|
|
399
|
+
# there are already at
|
|
400
|
+
# least max_partial_prefills requests to fill
|
|
401
|
+
if prefills >= scheduler_config.max_num_partial_prefills:
|
|
402
|
+
break
|
|
403
|
+
|
|
404
|
+
# Don't count long requests from the waiting queue if we aren't
|
|
405
|
+
# going to schedule them anyway
|
|
406
|
+
if (sg.first_seq.get_num_new_tokens()
|
|
407
|
+
> scheduler_config.long_prefill_token_threshold):
|
|
408
|
+
if (long_prefills + waiting_long_prefills
|
|
409
|
+
>= scheduler_config.max_long_partial_prefills):
|
|
410
|
+
continue
|
|
411
|
+
waiting_long_prefills += 1
|
|
412
|
+
prefills += 1
|
|
413
|
+
|
|
414
|
+
# NB: long_prefills and waiting_long_prefills are tracked separately.
|
|
415
|
+
# We don't account for the waiting requests here because we need to use
|
|
416
|
+
# this metadata to track how many have actually been scheduled.
|
|
417
|
+
return PartialPrefillMetadata(
|
|
418
|
+
schedulable_prefills=min(
|
|
419
|
+
prefills, scheduler_config.max_num_partial_prefills),
|
|
420
|
+
long_prefills=long_prefills,
|
|
421
|
+
scheduler_config=scheduler_config,
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
class Scheduler:
|
|
426
|
+
|
|
427
|
+
def __init__(
|
|
428
|
+
self,
|
|
429
|
+
scheduler_config: SchedulerConfig,
|
|
430
|
+
cache_config: CacheConfig,
|
|
431
|
+
lora_config: Optional[LoRAConfig],
|
|
432
|
+
pipeline_parallel_size: int = 1,
|
|
433
|
+
output_proc_callback: Optional[Callable] = None,
|
|
434
|
+
) -> None:
|
|
435
|
+
self.scheduler_config = scheduler_config
|
|
436
|
+
self.cache_config = cache_config
|
|
437
|
+
# Note for LoRA scheduling: the current policy is extremely
|
|
438
|
+
# simple and NOT fair. It can lead to starvation of some
|
|
439
|
+
# LoRAs. This should be improved in the future.
|
|
440
|
+
self.lora_config = lora_config
|
|
441
|
+
|
|
442
|
+
version = "selfattn"
|
|
443
|
+
if (self.scheduler_config.runner_type == "pooling"
|
|
444
|
+
or self.cache_config.is_attention_free):
|
|
445
|
+
version = "placeholder"
|
|
446
|
+
|
|
447
|
+
BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
|
|
448
|
+
version)
|
|
449
|
+
|
|
450
|
+
num_gpu_blocks = cache_config.num_gpu_blocks
|
|
451
|
+
if num_gpu_blocks:
|
|
452
|
+
num_gpu_blocks //= pipeline_parallel_size
|
|
453
|
+
|
|
454
|
+
num_cpu_blocks = cache_config.num_cpu_blocks
|
|
455
|
+
if num_cpu_blocks:
|
|
456
|
+
num_cpu_blocks //= pipeline_parallel_size
|
|
457
|
+
|
|
458
|
+
# Create the block space manager.
|
|
459
|
+
self.block_manager = BlockSpaceManagerImpl(
|
|
460
|
+
block_size=self.cache_config.block_size,
|
|
461
|
+
num_gpu_blocks=num_gpu_blocks,
|
|
462
|
+
num_cpu_blocks=num_cpu_blocks,
|
|
463
|
+
sliding_window=self.cache_config.sliding_window,
|
|
464
|
+
enable_caching=self.cache_config.enable_prefix_caching,
|
|
465
|
+
)
|
|
466
|
+
|
|
467
|
+
# Sequence groups in the WAITING state.
|
|
468
|
+
# Contain new prefill or preempted requests.
|
|
469
|
+
self.waiting: Deque[SequenceGroup] = deque()
|
|
470
|
+
# Sequence groups in the RUNNING state.
|
|
471
|
+
# Contain decode requests.
|
|
472
|
+
self.running: Deque[SequenceGroup] = deque()
|
|
473
|
+
# Sequence groups in the SWAPPED state.
|
|
474
|
+
# Contain decode requests that are swapped out.
|
|
475
|
+
self.swapped: Deque[SequenceGroup] = deque()
|
|
476
|
+
# Sequence groups finished requests ids since last step iteration.
|
|
477
|
+
# It lets the model know that any state associated with these requests
|
|
478
|
+
# can and must be released after the current step.
|
|
479
|
+
# This is used to evict the finished requests from the Mamba cache.
|
|
480
|
+
self._finished_requests_ids: List[str] = list()
|
|
481
|
+
# Time at previous scheduling step
|
|
482
|
+
self.prev_time = 0.0
|
|
483
|
+
# Did we schedule a prompt at previous step?
|
|
484
|
+
self.prev_prompt = False
|
|
485
|
+
# Latency of the last prompt step
|
|
486
|
+
self.last_prompt_latency = 0.0
|
|
487
|
+
# preemption mode, RECOMPUTE or SWAP
|
|
488
|
+
self.user_specified_preemption_mode = scheduler_config.preemption_mode
|
|
489
|
+
|
|
490
|
+
# The following field is test-only. It is used to inject artificial
|
|
491
|
+
# preemption.
|
|
492
|
+
self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
|
|
493
|
+
self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
|
|
494
|
+
if self.enable_artificial_preemption
|
|
495
|
+
else 0)
|
|
496
|
+
self.num_cumulative_preemption: int = 0
|
|
497
|
+
|
|
498
|
+
# Used to cache python objects
|
|
499
|
+
self._seq_group_metadata_cache: List[PyObjectCache] = []
|
|
500
|
+
self._scheduler_running_outputs_cache: List[PyObjectCache] = []
|
|
501
|
+
self._scheduled_seq_group_cache: List[PyObjectCache] = []
|
|
502
|
+
|
|
503
|
+
# For async output processing, we need to swap cache buffers between
|
|
504
|
+
# iterations. I.e. since the output processing is lagged one step,
|
|
505
|
+
# we cannot reuse the cached objects immediately when the schedule()
|
|
506
|
+
# is called again, but only when schedule() is called the second time.
|
|
507
|
+
self.output_proc_callback = output_proc_callback
|
|
508
|
+
self.use_async_output_proc = self.output_proc_callback is not None
|
|
509
|
+
self.num_cache_iters = 2 if self.use_async_output_proc else 1
|
|
510
|
+
|
|
511
|
+
self.cache_id = 0
|
|
512
|
+
for i in range(self.num_cache_iters):
|
|
513
|
+
self._seq_group_metadata_cache.append(
|
|
514
|
+
PyObjectCache(seq_group_metadata_builder))
|
|
515
|
+
self._scheduler_running_outputs_cache.append(
|
|
516
|
+
PyObjectCache(scheduler_running_outputs_builder))
|
|
517
|
+
self._scheduled_seq_group_cache.append(
|
|
518
|
+
PyObjectCache(scheduled_seq_group_builder))
|
|
519
|
+
|
|
520
|
+
# For async postprocessor, the extra decode run cannot be done
|
|
521
|
+
# when the request reaches max_model_len. In this case, the request
|
|
522
|
+
# will be stopped during schedule() call and added to this stop list
|
|
523
|
+
# for processing and deallocation by the free_finished_seq_groups()
|
|
524
|
+
self._async_stopped: List[SequenceGroup] = []
|
|
525
|
+
|
|
526
|
+
# List with the chunk sizes to hand out to each sequence depending
|
|
527
|
+
# on how many partial prefills are running. This is slightly faster than
|
|
528
|
+
# running an integer division every time a prefill is scheduled.
|
|
529
|
+
# This splits the budget evenly among all prefills.
|
|
530
|
+
self.partial_prefill_budget_lookup_list = [0] * (
|
|
531
|
+
self.scheduler_config.max_num_partial_prefills + 1)
|
|
532
|
+
self.partial_prefill_budget_lookup_list[0] = (
|
|
533
|
+
scheduler_config.max_num_batched_tokens)
|
|
534
|
+
for i in range(1, self.scheduler_config.max_num_partial_prefills + 1):
|
|
535
|
+
self.partial_prefill_budget_lookup_list[i] = (
|
|
536
|
+
scheduler_config.max_num_batched_tokens // i)
|
|
537
|
+
|
|
538
|
+
@property
|
|
539
|
+
def next_cache_id(self):
|
|
540
|
+
return (self.cache_id + 1) % self.num_cache_iters
|
|
541
|
+
|
|
542
|
+
@property
|
|
543
|
+
def lora_enabled(self) -> bool:
|
|
544
|
+
return bool(self.lora_config)
|
|
545
|
+
|
|
546
|
+
@property
|
|
547
|
+
def num_decoding_tokens_per_seq(self) -> int:
|
|
548
|
+
"""The number of new tokens."""
|
|
549
|
+
return 1
|
|
550
|
+
|
|
551
|
+
def add_seq_group(self, seq_group: SequenceGroup) -> None:
|
|
552
|
+
# Add sequence groups to the waiting queue.
|
|
553
|
+
self.waiting.append(seq_group)
|
|
554
|
+
|
|
555
|
+
def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None:
|
|
556
|
+
# Add sequence groups to the running queue.
|
|
557
|
+
# Only for testing purposes.
|
|
558
|
+
self.running.append(seq_group)
|
|
559
|
+
|
|
560
|
+
def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None:
|
|
561
|
+
# Add sequence groups to the swapped queue.
|
|
562
|
+
# Only for testing purposes.
|
|
563
|
+
self.swapped.append(seq_group)
|
|
564
|
+
|
|
565
|
+
def abort_seq_group(
|
|
566
|
+
self,
|
|
567
|
+
request_id: Union[str, Iterable[str]],
|
|
568
|
+
seq_id_to_seq_group: Optional[Dict[str, SequenceGroupBase]] = None,
|
|
569
|
+
) -> None:
|
|
570
|
+
"""Aborts a sequence group with the given ID.
|
|
571
|
+
|
|
572
|
+
        Check if the sequence group with the given ID
            is present in any of the state queues.
        If present, remove the sequence group from the state queue.
        Also, if any of the sequences in the sequence group is not finished,
            free the sequence with status `FINISHED_ABORTED`.
        Otherwise, do nothing.

        Args:
            request_id: The ID(s) of the sequence group to abort.
            seq_id_to_seq_group: helper for groups with n>1
        """
        if isinstance(request_id, str):
            request_id = (request_id, )
        request_ids = set(request_id)
        seq_id_to_seq_group = seq_id_to_seq_group or {}
        for state_queue in [self.waiting, self.running, self.swapped]:
            aborted_groups: List[SequenceGroup] = []
            for seq_group in state_queue:
                # When n>1, seq_group.request_id looks like
                # foo_parallel_sample_0, while request_ids is just foo, and we
                # should resolve it as real_request_id to match.
                if seq_group.request_id in seq_id_to_seq_group:
                    real_request_id = seq_id_to_seq_group[
                        seq_group.request_id].group_id
                else:
                    real_request_id = seq_group.request_id
                if real_request_id in request_ids:
                    # Appending aborted group into pending list.
                    aborted_groups.append(seq_group)
                    # We can't remove real_request_id in request_ids here,
                    # because there may be other seq groups sharing the same
                    # real_request_id
            for aborted_group in aborted_groups:
                # Remove the sequence group from the state queue.
                state_queue.remove(aborted_group)
                # Remove the aborted request from the Mamba cache.
                self._finished_requests_ids.append(aborted_group.request_id)
                for seq in aborted_group.get_seqs():
                    if seq.is_finished():
                        continue
                    seq.status = SequenceStatus.FINISHED_ABORTED
                    self.free_seq(seq)
                if aborted_group.request_id in seq_id_to_seq_group:
                    del seq_id_to_seq_group[aborted_group.request_id]

                self._free_seq_group_cross_attn_blocks(aborted_group)

    def _free_seq_group_cross_attn_blocks(
        self,
        seq_group: SequenceGroup,
    ) -> None:
        """
        Free a sequence group from a cross-attention block table.
        Has no effect on decoder-only models.
        """
        if seq_group.is_encoder_decoder():
            self.block_manager.free_cross(seq_group)

    def has_unfinished_seqs(self) -> bool:
        return (len(self.waiting) != 0 or len(self.running) != 0
                or len(self.swapped) != 0)

    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        return self.block_manager.get_prefix_cache_hit_rate(device)

    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        return self.block_manager.reset_prefix_cache(device)

    def get_num_unfinished_seq_groups(self) -> int:
        return len(self.waiting) + len(self.running) + len(self.swapped)

    def get_and_reset_finished_requests_ids(self) -> List[str]:
        """Flushes the list of request ids of previously finished seq_groups."""
        finished_requests_ids = self._finished_requests_ids
        self._finished_requests_ids = list()
        return finished_requests_ids

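Aside: the abort path above scans all three state queues and resolves parallel-sampling ids back to the original request id before removing groups. A minimal, self-contained sketch of that id-resolution idea (the dict-based resolution and the sample data here are illustrative, not vLLM's exact structures):

from collections import deque

def abort_matching(queues, request_ids, id_to_real_id=None):
    """Remove every entry whose resolved id is in request_ids (sketch)."""
    id_to_real_id = id_to_real_id or {}
    request_ids = set(request_ids)
    for q in queues:
        kept = deque(g for g in q
                     if id_to_real_id.get(g["id"], g["id"]) not in request_ids)
        q.clear()
        q.extend(kept)

waiting = deque([{"id": "foo_0"}, {"id": "bar"}])
abort_matching([waiting], {"foo"}, id_to_real_id={"foo_0": "foo"})
print([g["id"] for g in waiting])  # ['bar']
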
    def _schedule_running(
        self,
        budget: SchedulingBudget,
        curr_loras: Optional[Set[int]],
        enable_chunking: bool = False,
        partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
    ) -> SchedulerRunningOutputs:
        """Schedule sequence groups that are running.

        Running queue should include decode and chunked prefill requests.

        Args:
            budget: The scheduling budget. The argument is in-place updated
                when any decodes are preempted.
            curr_loras: Currently batched lora request ids. The argument is
                in-place updated when any decodes are preempted.
            enable_chunking: If True, seq group can be chunked and only a
                chunked number of tokens are scheduled if
                `budget.num_batched_tokens` does not have enough capacity to
                schedule all tokens.
            partial_prefill_metadata: information about the partial prefills
                that are currently running

        Returns:
            SchedulerRunningOutputs.
        """
        ret: SchedulerRunningOutputs = self._scheduler_running_outputs_cache[
            self.cache_id].get_object()
        ret.blocks_to_swap_out.clear()
        ret.blocks_to_copy.clear()
        ret.decode_seq_groups.clear()
        ret.prefill_seq_groups.clear()
        ret.preempted.clear()
        ret.swapped_out.clear()

        ret.num_lookahead_slots = self._get_num_lookahead_slots(
            is_prefill=False, enable_chunking=enable_chunking)

        ret.decode_seq_groups_list.clear()
        ret.prefill_seq_groups_list.clear()

        # Blocks that need to be swapped or copied before model execution.
        blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
        blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy

        decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
        prefill_seq_groups: List[
            ScheduledSequenceGroup] = ret.prefill_seq_groups
        preempted: List[SequenceGroup] = ret.preempted
        swapped_out: List[SequenceGroup] = ret.swapped_out

        running_queue = self.running
        assert len(self._async_stopped) == 0
        while running_queue:
            seq_group = running_queue[0]
            # We discard the cached tokens info here because we don't need it
            # for running sequences:
            # 1. If a sequence is running with chunked prefill, the cached
            #    tokens info was already used for the first prefill.
            # 2. If a sequence is running with non-chunked prefill, then it is
            #    a decoding sequence, and the cached tokens info is
            #    irrelevant.
            num_uncached_new_tokens, _ = \
                self._get_num_new_uncached_and_cached_tokens(
                    seq_group,
                    SequenceStatus.RUNNING,
                    enable_chunking,
                    budget,
                    partial_prefill_metadata,
                )

            num_running_tokens = num_uncached_new_tokens
            if num_running_tokens == 0:
                # No budget => Stop
                break

            running_queue.popleft()

            # With async postprocessor, an extra decode run is done
            # to process the final tokens. The check below avoids this extra
            # decode run when the model max len is reached, in order to avoid
            # a memory overflow.
            if (self.use_async_output_proc and seq_group.seqs[0].get_len()
                    > self.scheduler_config.max_model_len):
                self._async_stopped.append(seq_group)
                continue

            # NOTE(woosuk): Preemption happens only when there is no available
            # slot to keep all the sequence groups in the RUNNING state.
            while not self._can_append_slots(seq_group, enable_chunking):
                budget.subtract_num_batched_tokens(seq_group.request_id,
                                                   num_running_tokens)
                num_running_seqs = seq_group.get_max_num_running_seqs()
                budget.subtract_num_seqs(seq_group.request_id,
                                         num_running_seqs)

                if (curr_loras is not None and seq_group.lora_int_id > 0
                        and seq_group.lora_int_id in curr_loras):
                    curr_loras.remove(seq_group.lora_int_id)

                # Determine victim sequence
                cont_loop = True
                if running_queue:
                    # Preempt the lowest-priority sequence group.
                    victim_seq_group = running_queue.pop()
                else:
                    # No other sequence group can be preempted.
                    # Preempt the current sequence group.
                    # Note: This is also where we stop this loop
                    # (since there is nothing else to preempt)
                    victim_seq_group = seq_group
                    cont_loop = False

                # With async postprocessor, before preempting a sequence
                # we need to ensure it has no pending async postprocessor
                do_preempt = True
                if self.use_async_output_proc:
                    assert self.output_proc_callback is not None
                    self.output_proc_callback(
                        request_id=victim_seq_group.request_id)

                    # It may be that the async pending "victim_seq_group"
                    # becomes finished, in which case we simply free it.
                    if victim_seq_group.is_finished():
                        self._free_finished_seq_group(victim_seq_group)
                        do_preempt = False

                # Do preemption
                if do_preempt:
                    preempted_mode = self._preempt(victim_seq_group,
                                                   blocks_to_swap_out)
                    if preempted_mode == PreemptionMode.RECOMPUTE:
                        preempted.append(victim_seq_group)
                    else:
                        swapped_out.append(victim_seq_group)

                if not cont_loop:
                    break
            else:
                self._append_slots(seq_group, blocks_to_copy, enable_chunking)
                is_prefill = seq_group.is_prefill()

                scheduled_seq_group: ScheduledSequenceGroup = (
                    self._scheduled_seq_group_cache[
                        self.cache_id].get_object())
                scheduled_seq_group.seq_group = seq_group
                if is_prefill:
                    scheduled_seq_group.token_chunk_size = num_running_tokens
                    prefill_seq_groups.append(scheduled_seq_group)
                    ret.prefill_seq_groups_list.append(seq_group)
                else:
                    scheduled_seq_group.token_chunk_size = 1
                    decode_seq_groups.append(scheduled_seq_group)
                    ret.decode_seq_groups_list.append(seq_group)

                budget.add_num_batched_tokens(seq_group.request_id,
                                              num_running_tokens)
                # OPTIMIZATION: Note that get_max_num_running_seqs is
                # expensive. For the default scheduling case where
                # enable_chunking is False, num_seqs are updated before running
                # this method, so we don't have to update it again here.
                if enable_chunking:
                    num_running_seqs = seq_group.get_max_num_running_seqs()
                    budget.add_num_seqs(seq_group.request_id, num_running_seqs)
                if curr_loras is not None and seq_group.lora_int_id > 0:
                    curr_loras.add(seq_group.lora_int_id)

        self._scheduler_running_outputs_cache[self.next_cache_id].reset()
        self._scheduled_seq_group_cache[self.next_cache_id].reset()

        return ret

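Aside: `_schedule_running` relies on the scheduling budget doing reversible per-request bookkeeping, which is why preemption subtracts exactly the tokens and sequences that were added. A rough, self-contained sketch of that accounting pattern (a simplified stand-in, not the real `SchedulingBudget` class):

class TinyBudget:
    """Toy budget: tracks batched tokens and sequences against fixed limits."""

    def __init__(self, token_budget: int, max_num_seqs: int):
        self.token_budget = token_budget
        self.max_num_seqs = max_num_seqs
        self.num_batched_tokens = 0
        self.num_curr_seqs = 0

    def can_schedule(self, num_new_tokens: int, num_new_seqs: int) -> bool:
        return (self.num_batched_tokens + num_new_tokens <= self.token_budget
                and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)

    def add(self, tokens: int, seqs: int) -> None:
        self.num_batched_tokens += tokens
        self.num_curr_seqs += seqs

    def subtract(self, tokens: int, seqs: int) -> None:
        # Called on preemption to give the capacity back.
        self.num_batched_tokens -= tokens
        self.num_curr_seqs -= seqs
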
    def _schedule_swapped(
        self,
        budget: SchedulingBudget,
        curr_loras: Optional[Set[int]],
        enable_chunking: bool = False,
    ) -> SchedulerSwappedInOutputs:
        """Schedule sequence groups that are swapped out.

        It schedules swapped requests as long as they fit `budget` and
        curr_loras <= max_lora from the scheduling config. The input arguments
        `budget` and `curr_loras` are updated based on scheduled seq_groups.

        Args:
            budget: The scheduling budget. The argument is in-place updated
                when any requests are swapped in.
            curr_loras: Currently batched lora request ids. The argument is
                in-place updated when any requests are swapped in.
            enable_chunking: If True, seq group can be chunked and only a
                chunked number of tokens are scheduled if
                `budget.num_batched_tokens` does not have enough capacity to
                schedule all tokens.

        Returns:
            SchedulerSwappedInOutputs.
        """
        # Blocks that need to be swapped or copied before model execution.
        blocks_to_swap_in: List[Tuple[int, int]] = []
        blocks_to_copy: List[Tuple[int, int]] = []
        decode_seq_groups: List[ScheduledSequenceGroup] = []
        prefill_seq_groups: List[ScheduledSequenceGroup] = []
        infeasible_seq_groups: List[SequenceGroup] = []

        swapped_queue = self.swapped

        leftover_swapped: Deque[SequenceGroup] = deque()
        while swapped_queue:
            seq_group = swapped_queue[0]

            # If the sequence group cannot be swapped in, stop.
            is_prefill = seq_group.is_prefill()
            alloc_status = self.block_manager.can_swap_in(
                seq_group,
                self._get_num_lookahead_slots(is_prefill, enable_chunking))
            if alloc_status == AllocStatus.LATER:
                break
            elif alloc_status == AllocStatus.NEVER:
                logger.warning(
                    "Failing the request %s because there's not enough kv "
                    "cache blocks to run the entire sequence.",
                    seq_group.request_id,
                )
                for seq in seq_group.get_seqs():
                    seq.status = SequenceStatus.FINISHED_IGNORED
                infeasible_seq_groups.append(seq_group)
                swapped_queue.popleft()
                continue

            lora_int_id = 0
            if self.lora_enabled:
                lora_int_id = seq_group.lora_int_id
                assert curr_loras is not None
                assert self.lora_config is not None
                if (lora_int_id > 0 and (lora_int_id not in curr_loras)
                        and len(curr_loras) >= self.lora_config.max_loras):
                    # We don't have space for another LoRA, so
                    # we ignore this request for now.
                    leftover_swapped.appendleft(seq_group)
                    swapped_queue.popleft()
                    continue

            # The total number of sequences in the RUNNING state should not
            # exceed the maximum number of sequences.
            num_new_seqs = seq_group.get_max_num_running_seqs()
            num_new_tokens_uncached, num_new_tokens_cached = (
                self._get_num_new_uncached_and_cached_tokens(
                    seq_group, SequenceStatus.SWAPPED, enable_chunking,
                    budget))

            if num_new_tokens_uncached == 0 or not budget.can_schedule(
                    num_new_tokens=num_new_tokens_uncached,
                    num_new_seqs=num_new_seqs,
            ):
                break

            if lora_int_id > 0 and curr_loras is not None:
                curr_loras.add(lora_int_id)
            swapped_queue.popleft()
            self._swap_in(seq_group, blocks_to_swap_in)
            self._append_slots(seq_group, blocks_to_copy, enable_chunking)
            if is_prefill:
                prefill_seq_groups.append(
                    ScheduledSequenceGroup(
                        seq_group,
                        token_chunk_size=num_new_tokens_uncached +
                        num_new_tokens_cached,
                    ))
            else:
                decode_seq_groups.append(
                    ScheduledSequenceGroup(seq_group, token_chunk_size=1))
            budget.add_num_batched_tokens(
                seq_group.request_id,
                num_batched_tokens=num_new_tokens_uncached,
                num_cached_tokens=num_new_tokens_cached,
            )
            budget.add_num_seqs(seq_group.request_id, num_new_seqs)

        swapped_queue.extendleft(leftover_swapped)

        return SchedulerSwappedInOutputs(
            decode_seq_groups=decode_seq_groups,
            prefill_seq_groups=prefill_seq_groups,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_copy=blocks_to_copy,
            num_lookahead_slots=self._get_num_lookahead_slots(
                is_prefill=False, enable_chunking=enable_chunking),
            infeasible_seq_groups=infeasible_seq_groups,
        )

    def _get_prompt_limit(self, seq_group: SequenceGroup) -> int:
        if (self.scheduler_config.chunked_prefill_enabled
                and not self.scheduler_config.is_multi_step):
            prompt_limit = self.scheduler_config.max_model_len
        else:
            prompt_limit = min(
                self.scheduler_config.max_model_len,
                self.scheduler_config.max_num_batched_tokens,
            )

        # Model is fine tuned with long context. Return the fine tuned max_len.
        if seq_group.lora_request and seq_group.lora_request.long_lora_max_len:
            assert prompt_limit <= seq_group.lora_request.long_lora_max_len
            return seq_group.lora_request.long_lora_max_len
        else:
            return prompt_limit

    def _get_priority(self,
                      seq_group: SequenceGroup) -> Tuple[Optional[int], float]:
        """Get the priority of the sequence group.
        Highest preference to user-defined priority, followed by arrival time.
        Args:
            seq_group: The sequence group input.
        Returns:
            The priority of the sequence group.
        """
        return seq_group.priority, seq_group.arrival_time

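Aside: `_get_priority` returns a `(priority, arrival_time)` tuple, so ordinary tuple comparison gives user-defined priority precedence and falls back to FCFS on arrival time. A small illustration with made-up records (the dict layout is for illustration only):

from collections import deque

reqs = [
    {"id": "a", "priority": 1, "arrival_time": 10.0},
    {"id": "b", "priority": 0, "arrival_time": 12.0},
    {"id": "c", "priority": 0, "arrival_time": 11.0},
]
ordered = deque(sorted(reqs, key=lambda r: (r["priority"], r["arrival_time"])))
# ['c', 'b', 'a']: smaller priority value wins, earlier arrival breaks ties.
print([r["id"] for r in ordered])
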
    def _schedule_priority_preemption(
        self,
        budget: SchedulingBudget,
    ) -> int:
        """Sorts the waiting and running queues. Also force-preempts requests
        from the running queue if their priority is lower.
        Priority-based preemption is used with the priority policy.
        Args:
            budget: The scheduling budget. The argument is in-place updated
                when any requests are scheduled.
        Returns:
            A count of priority-based preemptions.
        """

        waiting_queue = self.waiting

        running_queue = deque(sorted(self.running, key=self._get_priority))

        blocks_to_swap_out: List[Tuple[int, int]] = []
        force_preemption_count = 0

        if waiting_queue:
            seq_group = waiting_queue.popleft()
            num_new_seqs = seq_group.get_max_num_running_seqs()
            num_new_tokens_uncached, _ = \
                self._get_num_new_uncached_and_cached_tokens(
                    seq_group, SequenceStatus.WAITING, False, budget)

            # Only preempt if priority inversion exists
            while running_queue and self._get_priority(
                    running_queue[-1]) > self._get_priority(seq_group):
                # Only preempt if waiting sequence cannot be allocated
                can_allocate = self.block_manager.can_allocate(seq_group)
                if (num_new_tokens_uncached > 0
                        and can_allocate == AllocStatus.OK
                        and budget.can_schedule(
                            num_new_tokens=num_new_tokens_uncached,
                            num_new_seqs=num_new_seqs,
                        )):
                    break

                # Adjust budget to remove the victim sequence group
                vseq_group = running_queue.pop()
                num_running_tokens_uncached, _ = (
                    self._get_num_new_uncached_and_cached_tokens(
                        vseq_group, SequenceStatus.RUNNING, False, budget))
                budget.subtract_num_batched_tokens(
                    vseq_group.request_id, num_running_tokens_uncached)
                num_running_seqs = vseq_group.get_max_num_running_seqs()
                budget.subtract_num_seqs(vseq_group.request_id,
                                         num_running_seqs)

                # Preempt out the victim sequence group
                self._preempt(vseq_group, blocks_to_swap_out)
                waiting_queue.appendleft(vseq_group)
                force_preemption_count += 1
            # Put the sequence back into the waiting queue
            waiting_queue.appendleft(seq_group)

        waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))

        self.waiting = waiting_queue
        self.running = running_queue
        return force_preemption_count

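Aside: the preemption loop above always victimizes `running_queue.pop()`, i.e. the tail of the priority-sorted running queue, and keeps going while a priority inversion exists and the waiting head still cannot fit. A deliberately simplified toy version of that loop, where plain integers stand in for priority tuples and a capacity check stands in for `can_allocate`/`can_schedule`:

from collections import deque

def force_preempt(waiting, running, capacity):
    """Toy: preempt lowest-priority running entries while an inversion exists."""
    preempted = 0
    if not waiting:
        return preempted
    head = waiting.popleft()
    while running and running[-1] > head and len(running) >= capacity:
        victim = running.pop()          # lowest-priority running request
        waiting.appendleft(victim)      # victim goes back to the waiting queue
        preempted += 1
    waiting.appendleft(head)
    return preempted

waiting = deque([1])                    # high-priority request (lower value = higher priority)
running = deque(sorted([3, 5, 7]))      # running queue sorted by priority value
print(force_preempt(waiting, running, capacity=3))  # 1 preemption frees a slot
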
    def _schedule_prefills(
        self,
        budget: SchedulingBudget,
        curr_loras: Optional[Set[int]],
        enable_chunking: bool = False,
        partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
    ) -> SchedulerPrefillOutputs:
        """Schedule sequence groups that are in prefill stage.

        Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
        as a new prefill (that starts from beginning -> most recently generated
        tokens).

        It schedules waiting requests as long as they fit `budget` and
        curr_loras <= max_lora from the scheduling config. The input arguments
        `budget` and `curr_loras` are updated based on scheduled seq_groups.

        Args:
            budget: The scheduling budget. The argument is in-place updated
                when any requests are scheduled.
            curr_loras: Currently batched lora request ids. The argument is
                in-place updated when any requests are scheduled.
            enable_chunking: If True, seq group can be chunked and only a
                chunked number of tokens are scheduled if
                `budget.num_batched_tokens` does not have enough capacity to
                schedule all tokens.
            partial_prefill_metadata: information about the partial prefills
                that are currently running

        Returns:
            SchedulerPrefillOutputs.
        """
        if budget.remaining_token_budget() == 0:
            # Do nothing: Can't add any more prefill anyway
            return SchedulerPrefillOutputs(
                seq_groups=[],
                ignored_seq_groups=[],
                num_lookahead_slots=self._get_num_lookahead_slots(
                    is_prefill=True, enable_chunking=enable_chunking),
            )
        ignored_seq_groups: List[SequenceGroup] = []
        seq_groups: List[ScheduledSequenceGroup] = []

        waiting_queue = self.waiting

        leftover_waiting_sequences: Deque[SequenceGroup] = deque()
        while self._passed_delay(time.time()) and waiting_queue:
            seq_group = waiting_queue[0]

            waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
            assert len(waiting_seqs) == 1, (
                "Waiting sequence group should have only one prompt "
                "sequence.")
            if (partial_prefill_metadata is not None
                    and not partial_prefill_metadata.can_schedule(seq_group)):
                leftover_waiting_sequences.appendleft(seq_group)
                waiting_queue.popleft()
                continue
            num_new_tokens_uncached, num_new_tokens_cached = (
                self._get_num_new_uncached_and_cached_tokens(
                    seq_group,
                    SequenceStatus.WAITING,
                    enable_chunking,
                    budget,
                    partial_prefill_metadata=partial_prefill_metadata,
                ))
            num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached

            if not enable_chunking:
                num_prompt_tokens = waiting_seqs[0].get_len()
                assert num_new_tokens == num_prompt_tokens

            prompt_limit = self._get_prompt_limit(seq_group)
            if num_new_tokens > prompt_limit:
                logger.warning(
                    "Input prompt (%d tokens) is too long"
                    " and exceeds limit of %d",
                    num_new_tokens,
                    prompt_limit,
                )
                for seq in waiting_seqs:
                    seq.status = SequenceStatus.FINISHED_IGNORED
                ignored_seq_groups.append(seq_group)
                waiting_queue.popleft()
                continue

            num_lookahead_slots: int = 0
            if self.scheduler_config.is_multi_step and enable_chunking:
                num_lookahead_slots = self._get_num_lookahead_slots(
                    True, enable_chunking)

            # If the sequence group cannot be allocated, stop.
            can_allocate = self.block_manager.can_allocate(
                seq_group, num_lookahead_slots=num_lookahead_slots)
            if can_allocate == AllocStatus.LATER:
                break
            elif can_allocate == AllocStatus.NEVER:
                logger.warning(
                    "Input prompt (%d tokens) + lookahead slots (%d) is "
                    "too long and exceeds the capacity of block_manager",
                    num_new_tokens,
                    num_lookahead_slots,
                )
                for seq in waiting_seqs:
                    seq.status = SequenceStatus.FINISHED_IGNORED
                ignored_seq_groups.append(seq_group)
                waiting_queue.popleft()
                continue

            lora_int_id = 0
            if self.lora_enabled:
                lora_int_id = seq_group.lora_int_id
                assert curr_loras is not None
                assert self.lora_config is not None
                if (self.lora_enabled and lora_int_id > 0
                        and lora_int_id not in curr_loras
                        and len(curr_loras) >= self.lora_config.max_loras):
                    # We don't have space for another LoRA, so
                    # we ignore this request for now.
                    leftover_waiting_sequences.appendleft(seq_group)
                    waiting_queue.popleft()
                    continue

            if (budget.num_batched_tokens
                    >= self.scheduler_config.max_num_batched_tokens):
                # We've reached the budget limit - since there might be
                # continuous prefills in the running queue, we should break
                # to avoid scheduling any new prefills.
                break

            num_new_seqs = seq_group.get_max_num_running_seqs()
            if num_new_tokens_uncached == 0 or not budget.can_schedule(
                    num_new_tokens=num_new_tokens_uncached,
                    num_new_seqs=num_new_seqs,
            ):
                break

            # Can schedule this request.
            if curr_loras is not None and lora_int_id > 0:
                curr_loras.add(lora_int_id)
            waiting_queue.popleft()
            self._allocate_and_set_running(seq_group)

            if partial_prefill_metadata is not None:
                partial_prefill_metadata.maybe_increment_partial_prefills(
                    seq_group)

            if enable_chunking and self.scheduler_config.is_multi_step:
                blocks_to_copy: List[Tuple[int, int]] = []
                # init_multi_step_from_lookahead_slots happens in append_slots
                self._append_slots(seq_group, blocks_to_copy, enable_chunking)
                # This assert will trip when a copy-on-write happens. This is
                # not a concern as the very first sequence-group block
                # allocation happens above. Still, we have the assert to
                # catch any edge-cases.
                assert not blocks_to_copy
            else:
                seq_group.init_multi_step_from_lookahead_slots(
                    num_lookahead_slots,
                    num_scheduler_steps=self.scheduler_config.
                    num_scheduler_steps,
                    is_multi_step=self.scheduler_config.is_multi_step,
                    enable_chunking=enable_chunking,
                )

            seq_groups.append(
                ScheduledSequenceGroup(seq_group=seq_group,
                                       token_chunk_size=num_new_tokens))
            budget.add_num_batched_tokens(
                seq_group.request_id,
                num_batched_tokens=num_new_tokens_uncached,
                num_cached_tokens=num_new_tokens_cached,
            )
            budget.add_num_seqs(seq_group.request_id, num_new_seqs)

        # Queue requests that couldn't be scheduled.
        waiting_queue.extendleft(leftover_waiting_sequences)
        if len(seq_groups) > 0:
            self.prev_prompt = True

        return SchedulerPrefillOutputs(
            seq_groups=seq_groups,
            ignored_seq_groups=ignored_seq_groups,
            num_lookahead_slots=self._get_num_lookahead_slots(
                is_prefill=True, enable_chunking=enable_chunking),
        )

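Aside: with `enable_chunking=True` a prefill only receives as many tokens as the remaining token budget allows; the rest is picked up on later scheduling iterations. The core of that chunking decision, reduced to one line (a simplification of what `_get_num_new_uncached_and_cached_tokens` and the budget do together, not the exact computation):

def chunk_size(remaining_prompt_tokens: int, remaining_token_budget: int) -> int:
    """How many prompt tokens this scheduling step may prefill (simplified)."""
    return min(remaining_prompt_tokens, remaining_token_budget)

print(chunk_size(4096, 2048))  # a 4096-token prompt is prefilled 2048 tokens at a time
print(chunk_size(100, 2048))   # short prompts finish prefill in a single step
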
    def _schedule_default(self) -> SchedulerOutputs:
        """Schedule queued requests.

        The current policy is designed to optimize throughput. First, it
        batches as many prefill requests as possible. Then it schedules
        decodes. If there is pressure on GPU memory, decode requests can
        be swapped or preempted.
        """
        # Include running requests to the budget.
        budget = SchedulingBudget(
            token_budget=self.scheduler_config.max_num_batched_tokens,
            max_num_seqs=self.scheduler_config.max_num_seqs,
        )
        # Make sure we include num running seqs before scheduling prefill,
        # so that we don't schedule beyond max_num_seqs for prefill.
        for seq_group in self.running:
            budget.add_num_seqs(seq_group.request_id,
                                seq_group.get_max_num_running_seqs())
        curr_loras = (set(
            seq_group.lora_int_id for seq_group in self.running
            if seq_group.lora_int_id > 0) if self.lora_enabled else None)

        prefills = SchedulerPrefillOutputs.create_empty()
        running_scheduled = SchedulerRunningOutputs.create_empty()
        swapped_in = SchedulerSwappedInOutputs.create_empty()

        # If any requests are swapped, prioritize swapped requests.
        if not self.swapped:
            prefills = self._schedule_prefills(budget,
                                               curr_loras,
                                               enable_chunking=False)

        if len(prefills.seq_groups
               ) == 0 and self.scheduler_config.policy == "priority":
            self._schedule_priority_preemption(budget)

        # Don't schedule decodes if prefills are scheduled.
        # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
        # only contains decode requests, not chunked prefills.
        if len(prefills.seq_groups) == 0:
            running_scheduled = self._schedule_running(budget,
                                                       curr_loras,
                                                       enable_chunking=False)

            # If any sequence group is preempted, do not swap in any sequence
            # group, because it means there's no slot for new running requests.
            if (len(running_scheduled.preempted) +
                    len(running_scheduled.swapped_out) == 0):
                swapped_in = \
                    self._schedule_swapped(budget, curr_loras)

        assert (budget.num_batched_tokens
                <= self.scheduler_config.max_num_batched_tokens)
        assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs

        # Update waiting requests.
        self.waiting.extendleft(running_scheduled.preempted)
        # Update new running requests.
        if len(prefills.seq_groups) > 0:
            self.running.extend([s.seq_group for s in prefills.seq_groups])

        self.running.extend(running_scheduled.decode_seq_groups_list)

        if len(swapped_in.decode_seq_groups) > 0:
            self.running.extend(
                [s.seq_group for s in swapped_in.decode_seq_groups])

        # Update swapped requests.
        self.swapped.extend(running_scheduled.swapped_out)
        preempted = len(running_scheduled.preempted) + len(
            running_scheduled.swapped_out)

        # There should be no prefill from running queue because this policy
        # doesn't allow chunked prefills.
        assert len(running_scheduled.prefill_seq_groups) == 0
        assert len(swapped_in.prefill_seq_groups) == 0

        # Merge lists
        num_prefill_groups = len(prefills.seq_groups)
        if num_prefill_groups > 0:
            scheduled_seq_groups = prefills.seq_groups
            scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
        else:
            scheduled_seq_groups = running_scheduled.decode_seq_groups
        scheduled_seq_groups.extend(swapped_in.decode_seq_groups)

        blocks_to_copy = running_scheduled.blocks_to_copy
        blocks_to_copy.extend(swapped_in.blocks_to_copy)

        ignored_seq_groups = prefills.ignored_seq_groups
        ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)

        return SchedulerOutputs(
            scheduled_seq_groups=scheduled_seq_groups,
            num_prefill_groups=num_prefill_groups,
            num_batched_tokens=budget.num_batched_tokens +
            budget.num_cached_tokens,
            blocks_to_swap_in=swapped_in.blocks_to_swap_in,
            blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy,
            ignored_seq_groups=ignored_seq_groups,
            num_lookahead_slots=running_scheduled.num_lookahead_slots,
            running_queue_size=len(self.running),
            preempted=preempted,
        )

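Aside: which of the two scheduling policies runs is decided by `scheduler_config.chunked_prefill_enabled` (see `_schedule` further down). At the user level that roughly corresponds to engine arguments like the following; treat the exact keyword arguments as indicative rather than authoritative for this particular build:

from vllm import LLM

# Default policy: prefills are batched first, decodes follow.
llm_default = LLM(model="facebook/opt-125m")

# Chunked-prefill policy: decodes are prioritized and prefills are chunked
# into the leftover per-step token budget.
llm_chunked = LLM(model="facebook/opt-125m",
                  enable_chunked_prefill=True,
                  max_num_batched_tokens=2048)
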
    def _schedule_chunked_prefill(self) -> SchedulerOutputs:
        """Schedule queued requests.

        Chunked prefill allows prefill requests to be chunked and batched
        together with decode requests. This policy 1. schedules as many
        decoding requests as possible, 2. schedules chunked prefill requests
        that are not finished, 3. schedules swapped requests, and 4. schedules
        new prefill requests.

        The policy can sustain high GPU utilization because it can put prefill
        and decode requests in the same batch, while it improves inter-token
        latency because decode requests don't need to be blocked by prefill
        requests.
        """
        budget = SchedulingBudget(
            token_budget=self.scheduler_config.max_num_batched_tokens,
            max_num_seqs=self.scheduler_config.max_num_seqs,
        )
        curr_loras: Set[int] = set()

        prefills = SchedulerPrefillOutputs.create_empty()
        swapped_in = SchedulerSwappedInOutputs.create_empty()

        # Create partial prefill metadata
        partial_prefill_metadata = PartialPrefillMetadata.from_queues(
            running=self.running,
            waiting=self.waiting,
            scheduler_config=self.scheduler_config,
        )

        # Decoding should always be scheduled first, by FCFS.
        running_scheduled = self._schedule_running(
            budget,
            curr_loras,
            enable_chunking=True,
            partial_prefill_metadata=partial_prefill_metadata,
        )

        # Schedule swapped out requests.
        # If preemption happens, it means we don't have space for swap-in.
        if len(running_scheduled.preempted) + len(
                running_scheduled.swapped_out) == 0:
            swapped_in = self._schedule_swapped(budget, curr_loras)

        prefills = self._schedule_prefills(
            budget,
            curr_loras,
            enable_chunking=True,
            partial_prefill_metadata=partial_prefill_metadata,
        )

        assert (budget.num_batched_tokens
                <= self.scheduler_config.max_num_batched_tokens)
        assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs

        # Update waiting requests.
        self.waiting.extendleft(running_scheduled.preempted)

        # Update new running requests.
        # By default, vLLM scheduler prioritizes prefills.
        # Once chunked prefill is enabled,
        # the policy is changed to prioritize decode requests.
        self.running.extend(
            [s.seq_group for s in swapped_in.decode_seq_groups])
        self.running.extend(
            [s.seq_group for s in swapped_in.prefill_seq_groups])
        self.running.extend(
            [s.seq_group for s in running_scheduled.decode_seq_groups])
        # Because multiple prefills may be running concurrently, we need to
        # make sure that prefills which are scheduled to finish are listed
        # before those that won't. This is so that on the next scheduling
        # iteration when they have transitioned to the decode stage, they are
        # properly prioritized over sequences that are still in the prefill
        # stage.
        self.running.extend(
            self._order_finishing_prefills_first(
                running_scheduled.prefill_seq_groups))
        self.running.extend([s.seq_group for s in prefills.seq_groups])

        # Update swapped requests.
        self.swapped.extend(running_scheduled.swapped_out)
        # Put prefills first due to Attention backend ordering assumption.
        scheduled_seq_groups = (prefills.seq_groups +
                                running_scheduled.prefill_seq_groups +
                                swapped_in.prefill_seq_groups +
                                running_scheduled.decode_seq_groups +
                                swapped_in.decode_seq_groups)
        num_prefill_groups = (len(prefills.seq_groups) +
                              len(swapped_in.prefill_seq_groups) +
                              len(running_scheduled.prefill_seq_groups))
        # If all prompts, then we set num_lookahead_slots to 0
        # this allows us to go through the `no_spec` path in
        # `spec_decode_worker.py`
        all_prefills = len(scheduled_seq_groups) == num_prefill_groups
        num_lookahead_slots = (0 if
                               (all_prefills
                                and not self.scheduler_config.is_multi_step)
                               else running_scheduled.num_lookahead_slots)
        return SchedulerOutputs(
            scheduled_seq_groups=scheduled_seq_groups,
            num_prefill_groups=num_prefill_groups,
            num_batched_tokens=budget.num_batched_tokens +
            budget.num_cached_tokens,
            blocks_to_swap_in=swapped_in.blocks_to_swap_in,
            blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
            blocks_to_copy=running_scheduled.blocks_to_copy +
            swapped_in.blocks_to_copy,
            ignored_seq_groups=prefills.ignored_seq_groups +
            swapped_in.infeasible_seq_groups,
            num_lookahead_slots=num_lookahead_slots,
            running_queue_size=len(self.running),
            preempted=(len(running_scheduled.preempted) +
                       len(running_scheduled.swapped_out)),
        )

    def _order_finishing_prefills_first(
        self, scheduled_prefill_seqs: List[ScheduledSequenceGroup]
    ) -> List[SequenceGroup]:
        """Returns a list of prefilling SequenceGroups where sequences that are
        scheduled to finish prefilling are listed first"""
        finishing = [
            s.seq_group for s in scheduled_prefill_seqs
            if s.seq_group.get_num_uncomputed_tokens() == s.token_chunk_size
        ]
        not_finishing = [
            s.seq_group for s in scheduled_prefill_seqs
            if s.seq_group.get_num_uncomputed_tokens() != s.token_chunk_size
        ]
        return finishing + not_finishing

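Aside: `_order_finishing_prefills_first` is a stable partition: groups whose remaining uncomputed tokens fit entirely in this chunk go first, and everything else keeps its relative order behind them. The same idea in isolation, on made-up (request, uncomputed_tokens, token_chunk_size) tuples:

def order_finishing_first(items, finishes):
    """Stable partition: items for which finishes(item) is True come first."""
    finishing = [x for x in items if finishes(x)]
    not_finishing = [x for x in items if not finishes(x)]
    return finishing + not_finishing

chunks = [("req-a", 512, 512), ("req-b", 2048, 512), ("req-c", 256, 256)]
# A prefill finishes this step when the chunk covers all remaining tokens.
print(order_finishing_first(chunks, lambda c: c[1] == c[2]))
# [('req-a', 512, 512), ('req-c', 256, 256), ('req-b', 2048, 512)]
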
    def _schedule(self) -> SchedulerOutputs:
        """Schedule queued requests."""
        if self.scheduler_config.chunked_prefill_enabled:
            return self._schedule_chunked_prefill()
        else:
            return self._schedule_default()

    def _can_append_slots(self, seq_group: SequenceGroup,
                          enable_chunking: bool) -> bool:
        """Determine whether or not we have enough space in the KV cache to
        continue generation of the sequence group.
        """
        # This is True only in testing, to trigger artificial preemption.
        if (self.enable_artificial_preemption
                and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
                and self.artificial_preempt_cnt > 0):
            self.artificial_preempt_cnt -= 1
            return False

        is_prefill = seq_group.is_prefill()
        num_lookahead_slots = self._get_num_lookahead_slots(
            is_prefill, enable_chunking)

        if is_prefill and num_lookahead_slots > 0:
            # Appending prefill slots only happens when multi-step and
            # chunked-prefill are enabled together.
            assert self.scheduler_config.is_multi_step and enable_chunking

        return self.block_manager.can_append_slots(
            seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)

    def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
        # async_output_proc is allowed only when we have a single sequence
        # in the sequence group
        no_single_seq = seq_group.sampling_params is None or (
            seq_group.sampling_params.n == 1)
        return no_single_seq

    def schedule(
            self
    ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
        # Schedule sequence groups.
        # This function call changes the internal states of the scheduler
        # such as self.running, self.swapped, and self.waiting.
        scheduler_start_time = time.perf_counter()

        scheduler_outputs: SchedulerOutputs = self._schedule()
        now = time.time()

        if not self.cache_config.enable_prefix_caching:
            common_computed_block_nums = []

        allow_async_output_proc: bool = self.use_async_output_proc

        # Create input data structures.
        seq_group_metadata_list: List[SequenceGroupMetadata] = []
        for i, scheduled_seq_group in enumerate(
                scheduler_outputs.scheduled_seq_groups):
            seq_group = scheduled_seq_group.seq_group
            token_chunk_size = scheduled_seq_group.token_chunk_size
            seq_group.maybe_set_first_scheduled_time(now)

            seq_group_metadata = self._seq_group_metadata_cache[
                self.cache_id].get_object()
            seq_group_metadata.seq_data.clear()
            seq_group_metadata.block_tables.clear()

            # seq_id -> SequenceData
            seq_data: Dict[int, SequenceData] = {}
            # seq_id -> physical block numbers
            block_tables: Dict[int, List[int]] = {}

            if seq_group.is_encoder_decoder():
                # Encoder associated with SequenceGroup
                encoder_seq = seq_group.get_encoder_seq()
                assert encoder_seq is not None
                encoder_seq_data = encoder_seq.data
                # Block table for cross-attention
                # Also managed at SequenceGroup level
                cross_block_table = self.block_manager.get_cross_block_table(
                    seq_group)
            else:
                encoder_seq_data = None
                cross_block_table = None

            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                seq_id = seq.seq_id
                seq_data[seq_id] = seq.data
                block_tables[seq_id] = self.block_manager.get_block_table(seq)
                self.block_manager.access_all_blocks_in_seq(seq, now)

            if self.cache_config.enable_prefix_caching:
                common_computed_block_nums = (
                    self.block_manager.get_common_computed_block_ids(
                        seq_group.get_seqs(status=SequenceStatus.RUNNING)))

            do_sample = True
            is_prompt = seq_group.is_prefill()
            # We should send the metadata to workers when the first prefill
            # is sent. Subsequent requests could be chunked prefill or decode.
            is_first_prefill = False
            if is_prompt:
                seqs = seq_group.get_seqs()
                # Prefill has only 1 sequence.
                assert len(seqs) == 1
                num_computed_tokens = seqs[0].data.get_num_computed_tokens()
                is_first_prefill = num_computed_tokens == 0
                # If not all prompt tokens are computed after this iteration,
                # the prefill is chunked, and we don't need sampling.
                # NOTE: We use get_len instead of get_prompt_len because when
                # a sequence is preempted, prefill includes previous generated
                # output tokens.
                if (token_chunk_size + num_computed_tokens
                        < seqs[0].data.get_len()):
                    do_sample = False

            # It assumes the scheduled_seq_groups is ordered by
            # prefill < decoding.
            if is_first_prefill or not self.scheduler_config.send_delta_data:
                seq_group_metadata = SequenceGroupMetadata(
                    request_id=seq_group.request_id,
                    is_prompt=is_prompt,
                    seq_data=seq_data,
                    sampling_params=seq_group.sampling_params,
                    block_tables=block_tables,
                    do_sample=do_sample,
                    pooling_params=seq_group.pooling_params,
                    token_chunk_size=token_chunk_size,
                    lora_request=seq_group.lora_request,
                    computed_block_nums=common_computed_block_nums,
                    encoder_seq_data=encoder_seq_data,
                    cross_block_table=cross_block_table,
                    state=seq_group.state,
                    token_type_ids=seq_group.token_type_ids,
                    # `multi_modal_data` will only be present for the 1st comm
                    # between engine and worker.
                    # the subsequent comms can still use delta, but
                    # `multi_modal_data` will be None.
                    multi_modal_data=(seq_group.multi_modal_data
                                      if scheduler_outputs.num_prefill_groups
                                      > 0 else None),
                    multi_modal_placeholders=(
                        seq_group.multi_modal_placeholders
                        if scheduler_outputs.num_prefill_groups > 0 else None),
                    prompt_adapter_request=seq_group.prompt_adapter_request,
                )
            else:
                # When SPMD mode is enabled, we only send delta data except for
                # the first request to reduce serialization cost.
                seq_data_delta = {}
                for id, data in seq_data.items():
                    seq_data_delta[id] = data.get_delta_and_reset()
                seq_group_metadata = SequenceGroupMetadataDelta(
                    seq_data_delta,
                    seq_group.request_id,
                    block_tables,
                    is_prompt,
                    do_sample=do_sample,
                    token_chunk_size=token_chunk_size,
                    computed_block_nums=common_computed_block_nums,
                )
            seq_group_metadata_list.append(seq_group_metadata)

            if allow_async_output_proc:
                allow_async_output_proc = self._allow_async_output_proc(
                    seq_group)

        # Now that the batch has been created, we can assume all blocks in the
        # batch will have been computed before the next scheduling invocation.
        # This is because the engine assumes that a failure in model execution
        # will crash the vLLM instance / will not retry.
        for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
            self.block_manager.mark_blocks_as_computed(
                scheduled_seq_group.seq_group,
                scheduled_seq_group.token_chunk_size)

        self._seq_group_metadata_cache[self.next_cache_id].reset()

        scheduler_time = time.perf_counter() - scheduler_start_time
        # Add this scheduler time to all the sequences that are currently
        # running. This will help estimate if the scheduler is a significant
        # component in the e2e latency.
        for seq_group in self.running:
            if seq_group is not None and seq_group.metrics is not None:
                if seq_group.metrics.scheduler_time is not None:
                    seq_group.metrics.scheduler_time += scheduler_time
                else:
                    seq_group.metrics.scheduler_time = scheduler_time

        # Move to next cache (if exists)
        self.cache_id = self.next_cache_id

        # Return results
        return (seq_group_metadata_list, scheduler_outputs,
                allow_async_output_proc)

    def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        self.block_manager.fork(parent_seq, child_seq)

    def free_seq(self, seq: Sequence) -> None:
        """Free a sequence from a block table."""
        self.block_manager.free(seq)

    def _free_finished_seqs(self, seq_group: SequenceGroup) -> None:
        """Free finished seqs in a sequence group."""
        for seq in seq_group.get_seqs():
            if seq.is_finished():
                self.free_seq(seq)

    def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None:
        if seq_group.is_finished():
            # Free cross-attention block table, if it exists
            self._free_seq_group_cross_attn_blocks(seq_group)

            # Add the finished requests to the finished requests list.
            # This list will be used to update the Mamba cache in the
            # next step.
            self._finished_requests_ids.append(seq_group.request_id)

        # Free finished seqs
        self._free_finished_seqs(seq_group)

    def free_finished_seq_groups(self) -> None:
        remaining: Deque[SequenceGroup] = deque()
        for seq_group in self.running:
            self._free_finished_seq_group(seq_group)
            if not seq_group.is_finished():
                remaining.append(seq_group)

        self.running = remaining

        # Handle async stopped sequence groups
        # (ones that reached max model len)
        if self._async_stopped:
            for seq_group in self._async_stopped:
                self._free_seq_group_cross_attn_blocks(seq_group)
                self._finished_requests_ids.append(seq_group.request_id)

                # Free finished seqs
                self._free_finished_seqs(seq_group)

            self._async_stopped.clear()

    def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
        self.block_manager.allocate(seq_group)
        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
            seq.status = SequenceStatus.RUNNING

    def _append_slots(
        self,
        seq_group: SequenceGroup,
        blocks_to_copy: List[Tuple[int, int]],
        enable_chunking: bool = False,
    ) -> None:
        """Appends new slots to the sequences in the given sequence group.

        Args:
            seq_group (SequenceGroup): The sequence group containing the
                sequences to append slots to.
            blocks_to_copy (List[Tuple[int, int]]): A list of tuples of two
                ints, where the first int is the source block index and the
                second int is the destination block index. This list is updated
                with the new source and destination block indices for the
                appended slots.
            enable_chunking (bool): True if chunked prefill is enabled.
        """
        is_prefill: bool = seq_group.is_prefill()
        num_lookahead_slots: int = self._get_num_lookahead_slots(
            is_prefill, enable_chunking)

        seq_group.init_multi_step_from_lookahead_slots(
            num_lookahead_slots,
            num_scheduler_steps=self.scheduler_config.num_scheduler_steps,
            is_multi_step=self.scheduler_config.is_multi_step,
            enable_chunking=enable_chunking,
        )

        seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING
        if self.scheduler_config.is_multi_step and enable_chunking:
            # In multi-step chunked-prefill any sequence type can have
            # slots appended.
            seq_status = None

        for seq in seq_group.get_seqs(status=seq_status):
            cows = self.block_manager.append_slots(seq, num_lookahead_slots)
            if len(cows) > 0:
                blocks_to_copy.extend(cows)

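Aside: `append_slots` can return copy-on-write (src, dst) block pairs, and the scheduler simply accumulates them so the worker can perform the copies before the step runs. A schematic of that data flow with a fake block manager (the block ids and the helper are made up for illustration):

from typing import List, Tuple

blocks_to_copy: List[Tuple[int, int]] = []

def fake_append_slots(seq_id: int) -> List[Tuple[int, int]]:
    # Pretend seq 7 shares its last block and needs a copy-on-write.
    return [(12, 34)] if seq_id == 7 else []

for seq_id in (3, 7, 9):
    cows = fake_append_slots(seq_id)
    if cows:
        blocks_to_copy.extend(cows)

print(blocks_to_copy)  # [(12, 34)]: copy block 12 into block 34 before executing
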
    def _preempt(self, seq_group: SequenceGroup,
                 blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
        # If preemption mode is not specified, we determine the mode as follows:
        # We use recomputation by default since it incurs lower overhead than
        # swapping. However, when the sequence group has multiple sequences
        # (e.g., beam search), recomputation is not currently supported. In
        # such a case, we use swapping instead.
        # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
        # As swapped sequences are prioritized over waiting sequences,
        # sequence groups with multiple sequences are implicitly prioritized
        # over sequence groups with a single sequence.
        # TODO(woosuk): Support recomputation for sequence groups with multiple
        # sequences. This may require a more sophisticated CUDA kernel.
        if self.user_specified_preemption_mode is None:
            if seq_group.get_max_num_running_seqs() == 1:
                preemption_mode = PreemptionMode.RECOMPUTE
            else:
                preemption_mode = PreemptionMode.SWAP

        elif self.user_specified_preemption_mode == "swap":
            preemption_mode = PreemptionMode.SWAP
        else:
            preemption_mode = PreemptionMode.RECOMPUTE

        if self.num_cumulative_preemption % 50 == 0:
            logger.warning(
                "Sequence group %s is preempted by %s mode because there is "
                "not enough KV cache space. This can affect the end-to-end "
                "performance. Increase gpu_memory_utilization or "
                "tensor_parallel_size to provide more KV cache memory. "
                "total_num_cumulative_preemption=%d",
                seq_group.request_id,
                preemption_mode,
                self.num_cumulative_preemption + 1,
            )
        self.num_cumulative_preemption += 1

        if preemption_mode == PreemptionMode.RECOMPUTE:
            self._preempt_by_recompute(seq_group)
        elif preemption_mode == PreemptionMode.SWAP:
            self._preempt_by_swap(seq_group, blocks_to_swap_out)
        else:
            raise AssertionError("Invalid preemption mode.")
        return preemption_mode

    def _preempt_by_recompute(
        self,
        seq_group: SequenceGroup,
    ) -> None:
        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
        assert len(seqs) == 1
        for seq in seqs:
            seq.status = SequenceStatus.WAITING
            self.free_seq(seq)
            seq.reset_state_for_recompute()
        self._free_seq_group_cross_attn_blocks(seq_group)

    def _preempt_by_swap(
        self,
        seq_group: SequenceGroup,
        blocks_to_swap_out: List[Tuple[int, int]],
    ) -> None:
        self._swap_out(seq_group, blocks_to_swap_out)

    def _swap_in(
        self,
        seq_group: SequenceGroup,
        blocks_to_swap_in: List[Tuple[int, int]],
    ) -> None:
        mapping = self.block_manager.swap_in(seq_group)
        blocks_to_swap_in.extend(mapping)
        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
            seq.status = SequenceStatus.RUNNING

    def _swap_out(
        self,
        seq_group: SequenceGroup,
        blocks_to_swap_out: List[Tuple[int, int]],
    ) -> None:
        if not self.block_manager.can_swap_out(seq_group):
            # FIXME(woosuk): Abort the sequence group instead of aborting the
            # entire engine.
            raise RuntimeError(
                "Aborted due to the lack of CPU swap space. Please increase "
                "the swap space to avoid this error.")
        mapping = self.block_manager.swap_out(seq_group)
        blocks_to_swap_out.extend(mapping)
        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            seq.status = SequenceStatus.SWAPPED

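Aside: the preemption-mode decision in `_preempt` reduces to a small table: recompute for single-sequence groups (cheaper), swap when a group has several sequences or the user forces swapping. A condensed restatement, with `PreemptionMode` stubbed out as strings for illustration:

def pick_preemption_mode(max_running_seqs: int, user_mode: str | None) -> str:
    """Simplified mirror of the mode selection above (strings stand in for the enum)."""
    if user_mode is None:
        return "RECOMPUTE" if max_running_seqs == 1 else "SWAP"
    return "SWAP" if user_mode == "swap" else "RECOMPUTE"

print(pick_preemption_mode(1, None))    # RECOMPUTE: cheapest for a single sequence
print(pick_preemption_mode(4, None))    # SWAP: recomputation unsupported for multi-seq groups
print(pick_preemption_mode(1, "swap"))  # SWAP: explicit user override
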
def _passed_delay(self, now: float) -> bool:
|
|
1834
|
+
if self.prev_prompt:
|
|
1835
|
+
self.last_prompt_latency = now - self.prev_time
|
|
1836
|
+
self.prev_time, self.prev_prompt = now, False
|
|
1837
|
+
# Delay scheduling prompts to let waiting queue fill up
|
|
1838
|
+
if self.scheduler_config.delay_factor > 0 and self.waiting:
|
|
1839
|
+
earliest_arrival_time = min(
|
|
1840
|
+
[e.metrics.arrival_time for e in self.waiting])
|
|
1841
|
+
passed_delay = ((now - earliest_arrival_time)
|
|
1842
|
+
> (self.scheduler_config.delay_factor *
|
|
1843
|
+
self.last_prompt_latency) or not self.running)
|
|
1844
|
+
else:
|
|
1845
|
+
passed_delay = True
|
|
1846
|
+
return passed_delay
|
|
1847
|
+
|
|
1848 +     def _get_num_lookahead_slots(self, is_prefill: bool,
1849 +                                  enable_chunking: bool) -> int:
1850 +         """The number of slots to allocate per sequence per step, beyond known
1851 +         token ids. Speculative decoding uses these slots to store KV activations
1852 +         of tokens which may or may not be accepted.
1853 +
1854 +         Speculative decoding does not yet support prefill, so we do not perform
1855 +         lookahead allocation for prefill.
1856 +
1857 +         When chunking is enabled with multi-step, we allocate lookahead slots
1858 +         for the prefills for when the prefills turn into decodes in the first
1859 +         step.
1860 +         """
1861 +         if is_prefill:
1862 +             if self.scheduler_config.is_multi_step and enable_chunking:
1863 +                 # num_lookahead_slots was introduced in the context of decodes,
1864 +                 # in Speculative Decoding.
1865 +                 # When num_scheduler_steps is 8, say, then
1866 +                 # num_lookahead_slots is 7. Meaning, we are doing 1 step of
1867 +                 # decode anyway and we wish to do 7 more.
1868 +                 #
1869 +                 # "Lookaheads" for prefills were introduced in support of
1870 +                 # Chunked-Prefill in Multi-Step.
1871 +                 return self.scheduler_config.num_lookahead_slots + 1
1872 +             else:
1873 +                 return 0
1874 +
1875 +         return self.scheduler_config.num_lookahead_slots
1876 +
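To make the rule above concrete, here is a standalone sketch of the lookahead-slot decision (not part of the package); the config object is a plain dataclass stand-in for the scheduler config, and the values follow the 8-step example from the comment.

```python
# Sketch of the lookahead-slot rule documented above; Cfg is a stand-in.
from dataclasses import dataclass


@dataclass
class Cfg:
    is_multi_step: bool
    num_lookahead_slots: int


def get_num_lookahead_slots(cfg: Cfg, is_prefill: bool,
                            enable_chunking: bool) -> int:
    if is_prefill:
        # Multi-step + chunked prefill: the prefill becomes a decode after
        # the first internal step, so it needs num_lookahead_slots + 1 slots.
        if cfg.is_multi_step and enable_chunking:
            return cfg.num_lookahead_slots + 1
        return 0
    return cfg.num_lookahead_slots


# With num_scheduler_steps=8, num_lookahead_slots is 7:
cfg = Cfg(is_multi_step=True, num_lookahead_slots=7)
print(get_num_lookahead_slots(cfg, is_prefill=True, enable_chunking=True))   # 8
print(get_num_lookahead_slots(cfg, is_prefill=False, enable_chunking=True))  # 7
print(get_num_lookahead_slots(cfg, is_prefill=True, enable_chunking=False))  # 0
```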
1877 +     def _get_num_new_uncached_and_cached_tokens(
1878 +         self,
1879 +         seq_group: SequenceGroup,
1880 +         status: SequenceStatus,
1881 +         enable_chunking: bool,
1882 +         budget: SchedulingBudget,
1883 +         partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
1884 +     ) -> Tuple[int, int]:
1885 +         """
1886 +         Returns the number of new uncached and cached tokens to schedule for a
1887 +         given sequence group that's in a given `status`.
1888 +
1889 +         The API could chunk the number of tokens to compute based on `budget`
1890 +         if `enable_chunking` is True. If a sequence group has multiple
1891 +         sequences (e.g., running beam search), it means it is in the decoding
1892 +         phase, so chunking doesn't happen.
1893 +
1894 +         Returns (0, 0) if the new token cannot be computed due to the token budget.
1895 +
1896 +         The cached tokens' blocks are already computed, and the attention
1897 +         backend will reuse the cached blocks rather than recomputing them. So
1898 +         the scheduler could schedule these cached tokens "for free".
1899 +
1900 +         Args:
1901 +             seq_group: The sequence group to get the number of new tokens to
1902 +                 schedule.
1903 +             status: The status of the sequences to get the number of new tokens
1904 +                 to schedule.
1905 +             enable_chunking: Whether to chunk the number of tokens to compute.
1906 +             budget: The budget to chunk the number of tokens to compute.
1907 +             partial_prefill_metadata: Information about the partial prefills
1908 +                 that are currently running.
1909 +
1910 +
1911 +         Returns:
1912 +             A tuple of two ints. The first int is the number of new uncached
1913 +             tokens to schedule. The second int is the number of cached tokens.
1914 +             If no more new tokens can be scheduled, returns (0, 0).
1915 +         """
1916 +         num_cached_new_tokens = 0
1917 +         num_uncached_new_tokens = 0
1918 +
1919 +         seqs = seq_group.get_seqs(status=status)
1920 +         # Compute the number of new uncached and cached tokens for
1921 +         # each sequence.
1922 +         for seq in seqs:
1923 +             if not seq.is_prefill():
1924 +                 # Decode sequences should always just have 1 uncached token
1925 +                 # TODO(rickyx): Actually is this still correct for multi-step?
1926 +                 num_uncached_new_tokens += 1
1927 +                 continue
1928 +
1929 +             num_computed_tokens_seq = seq.get_num_computed_tokens()
1930 +             all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq
1931 +             if not self.cache_config.enable_prefix_caching:
1932 +                 # If prefix caching is not enabled, all new tokens are uncached.
1933 +                 num_uncached_new_tokens += all_num_new_tokens_seq
1934 +                 continue
1935 +
1936 +             # NOTE: the cached token might currently be in a block that's in an
1937 +             # evictor, meaning that it's not yet allocated. However, we don't
1938 +             # exclude such tokens from the cache count because they will be
1939 +             # guaranteed to be allocated later if the sequence can be allocated.
1940 +             num_cached_tokens_seq = self.block_manager.get_num_cached_tokens(
1941 +                 seq)
1942 +
1943 +             # Sanity check.
1944 +             if num_cached_tokens_seq < num_computed_tokens_seq:
1945 +                 # This should only happen with chunked prefill, and
1946 +                 # the seq is still in prefill. The `num_cached_tokens_seq`
1947 +                 # is the value we calculated on scheduling the first prefill.
1948 +                 # For subsequent continuous prefill steps, we cached the
1949 +                 # number of cache tokens for the sequence, so the cached token
1950 +                 # count could be less than the number of computed tokens.
1951 +                 # See comments on `ComputedBlocksTracker` for more details.
1952 +                 assert (
1953 +                     seq.is_prefill() and seq.status == SequenceStatus.RUNNING
1954 +                     and self.scheduler_config.chunked_prefill_enabled
1955 +                 ), ("Number of cached tokens should not be less than the "
1956 +                     "number of computed tokens for a sequence that's still "
1957 +                     f"in prefill. But there are {num_cached_tokens_seq} cached "
1958 +                     f"tokens and {num_computed_tokens_seq} computed tokens "
1959 +                     f"for sequence {seq.seq_id}.")
1960 +
1961 +             num_cached_new_tokens_seq = max(
1962 +                 0, num_cached_tokens_seq - num_computed_tokens_seq)
1963 +             num_uncached_new_tokens_seq = (all_num_new_tokens_seq -
1964 +                                            num_cached_new_tokens_seq)
1965 +
1966 +             num_uncached_new_tokens += num_uncached_new_tokens_seq
1967 +             num_cached_new_tokens += num_cached_new_tokens_seq
1968 +
1969 +         if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0:
1970 +             # For a fully cached hit sequence, we actually need to recompute the
1971 +             # last token. So we need at least 1 uncached token to schedule.
1972 +             # See ModelRunner._compute_for_prefix_cache_hit for more details.
1973 +             num_uncached_new_tokens = 1
1974 +             num_cached_new_tokens -= 1
1975 +
1976 +         if enable_chunking and len(seqs) == 1:
1977 +             # Chunk if a running request cannot fit in the given budget.
1978 +             # If number of seq > 1, it means it is doing beam search
1979 +             # in a decode phase. Do not chunk.
1980 +             num_uncached_new_tokens = self._chunk_new_tokens_to_schedule(
1981 +                 self.scheduler_config,
1982 +                 self.cache_config,
1983 +                 budget,
1984 +                 self._get_prompt_limit(seq_group),
1985 +                 num_uncached_new_tokens,
1986 +                 self.partial_prefill_budget_lookup_list,
1987 +                 partial_prefill_metadata,
1988 +             )
1989 +
1990 +         return num_uncached_new_tokens, num_cached_new_tokens
1991 +
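The per-sequence accounting above splits the remaining prompt into cached tokens (already present in prefix-cache blocks) and uncached tokens, and forces at least one uncached token on a full cache hit. The arithmetic for a single prefill sequence is sketched below with made-up input values; this is an illustration, not the package's code path.

```python
# Sketch of the cached/uncached split for one prefill sequence.
from typing import Tuple


def split_new_tokens(seq_len: int, num_computed: int,
                     num_cached: int) -> Tuple[int, int]:
    all_new = seq_len - num_computed
    cached_new = max(0, num_cached - num_computed)
    uncached_new = all_new - cached_new
    if uncached_new == 0 and cached_new > 0:
        # A full prefix-cache hit still recomputes the last token.
        uncached_new, cached_new = 1, cached_new - 1
    return uncached_new, cached_new


# 100-token prompt, nothing computed yet, 64 tokens found in the prefix cache:
print(split_new_tokens(seq_len=100, num_computed=0, num_cached=64))   # (36, 64)
# Fully cached prompt: one token is still scheduled as uncached.
print(split_new_tokens(seq_len=100, num_computed=0, num_cached=100))  # (1, 99)
```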
1992 +     @staticmethod
1993 +     def _chunk_new_tokens_to_schedule(
1994 +         scheduler_config: SchedulerConfig,
1995 +         cache_config: CacheConfig,
1996 +         budget: SchedulingBudget,
1997 +         prompt_limit: int,
1998 +         num_new_tokens: int,
1999 +         partial_prefill_budget_lookup_list: List[int],
2000 +         partial_prefill_metadata: Optional[PartialPrefillMetadata] = None,
2001 +     ) -> int:
2002 +         """
2003 +         Chunks the number of new tokens to schedule based on the budget when
2004 +         chunked prefill is enabled.
2005 +
2006 +         Args:
2007 +             scheduler_config: The scheduler config.
2008 +             cache_config: The cache config.
2009 +             budget: The budget to chunk the number of tokens to compute.
2010 +             prompt_limit: The maximum number of tokens allowed in a prompt.
2011 +             num_new_tokens: The number of new tokens to schedule.
2012 +
2013 +         Returns:
2014 +             The number of new tokens to schedule after chunking.
2015 +         """
2016 +         remaining_token_budget = budget.remaining_token_budget()
2017 +         if scheduler_config.is_multi_step:
2018 +             # The current multi-step + chunked prefill capability does
2019 +             # not actually support chunking prompts.
2020 +             #
2021 +             # Therefore, `num_new_tokens` is computed in the same fashion
2022 +             # for both multi-step+chunked-prefill &
2023 +             # multi-step+chunked-prefill+APC
2024 +             #
2025 +             # Prompts with more tokens than the current remaining budget
2026 +             # are postponed to future scheduler steps
2027 +             if num_new_tokens > prompt_limit:
2028 +                 # If the seq_group is in prompt-stage, pass the
2029 +                 # num_new_tokens as-is so the caller can ignore
2030 +                 # the sequence.
2031 +                 return num_new_tokens
2032 +
2033 +             return 0 if num_new_tokens > \
2034 +                 remaining_token_budget else num_new_tokens
2035 +
2036 +         # Get the number of tokens to allocate to this prefill slot
2037 +         prefill_slot_budget = (
2038 +             remaining_token_budget if partial_prefill_metadata is None else
2039 +             partial_prefill_budget_lookup_list[
2040 +                 partial_prefill_metadata.schedulable_prefills])
2041 +
2042 +         if cache_config.enable_prefix_caching:
2043 +             # When prefix caching is enabled and we're partially prefilling
2044 +             # a sequence, we always allocate a number of new tokens that is
2045 +             # divisible by the block size to avoid partial block matching.
2046 +             block_size = cache_config.block_size
2047 +             # Don't exceed either the total budget or slot budget.
2048 +             # Take min of those and get the next lowest multiple of the
2049 +             # block size:
2050 +             remaining_token_budget = (
2051 +                 min(remaining_token_budget, prefill_slot_budget) //
2052 +                 block_size) * block_size
2053 +             # NB: In the case where num_new_tokens < budget, we are
2054 +             # finishing prefill for this sequence, so we do not need to
2055 +             # allocate a full block.
2056 +
2057 +         num_new_tokens = min(num_new_tokens, remaining_token_budget,
2058 +                              prefill_slot_budget)
2059 +
2060 +         return num_new_tokens
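The prefix-caching branch above rounds the effective budget down to a multiple of the block size before capping the chunk. A standalone sketch of that block-alignment math follows; the numbers and the helper name are illustrative only, not taken from the package.

```python
# Sketch of the block-aligned chunking math (prefix-caching path).
def chunk_new_tokens(num_new_tokens: int, remaining_token_budget: int,
                     prefill_slot_budget: int, block_size: int) -> int:
    # Round the effective budget down to a multiple of the block size so a
    # partial chunk never ends mid-block, which would defeat prefix matching.
    aligned_budget = (min(remaining_token_budget, prefill_slot_budget)
                      // block_size) * block_size
    return min(num_new_tokens, aligned_budget, prefill_slot_budget)


# 500 prompt tokens left, 300 tokens of global budget, a 256-token slot,
# block_size=16: the chunk is capped at 256 (already block-aligned).
print(chunk_new_tokens(500, 300, 256, 16))  # 256
# With only 100 tokens of budget, the chunk is rounded down to 96.
print(chunk_new_tokens(500, 100, 256, 16))  # 96
```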