vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm/_C.abi3.so +0 -0
- vllm/__init__.py +53 -0
- vllm/_custom_ops.py +1828 -0
- vllm/_ipex_ops.py +244 -0
- vllm/_version.py +34 -0
- vllm/adapter_commons/__init__.py +0 -0
- vllm/adapter_commons/layers.py +16 -0
- vllm/adapter_commons/models.py +106 -0
- vllm/adapter_commons/request.py +26 -0
- vllm/adapter_commons/utils.py +93 -0
- vllm/adapter_commons/worker_manager.py +39 -0
- vllm/assets/__init__.py +0 -0
- vllm/assets/audio.py +45 -0
- vllm/assets/base.py +41 -0
- vllm/assets/image.py +34 -0
- vllm/assets/video.py +115 -0
- vllm/attention/__init__.py +20 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +308 -0
- vllm/attention/backends/blocksparse_attn.py +461 -0
- vllm/attention/backends/cpu_mla.py +307 -0
- vllm/attention/backends/dual_chunk_flash_attn.py +1498 -0
- vllm/attention/backends/flash_attn.py +1003 -0
- vllm/attention/backends/flashinfer.py +1104 -0
- vllm/attention/backends/flashmla.py +244 -0
- vllm/attention/backends/hpu_attn.py +313 -0
- vllm/attention/backends/ipex_attn.py +398 -0
- vllm/attention/backends/mla/__init__.py +0 -0
- vllm/attention/backends/mla/common.py +1385 -0
- vllm/attention/backends/pallas.py +351 -0
- vllm/attention/backends/placeholder_attn.py +400 -0
- vllm/attention/backends/rocm_aiter_mla.py +435 -0
- vllm/attention/backends/rocm_flash_attn.py +975 -0
- vllm/attention/backends/torch_sdpa.py +703 -0
- vllm/attention/backends/triton_mla.py +115 -0
- vllm/attention/backends/utils.py +610 -0
- vllm/attention/backends/xformers.py +802 -0
- vllm/attention/layer.py +468 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
- vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
- vllm/attention/ops/blocksparse_attention/interface.py +239 -0
- vllm/attention/ops/blocksparse_attention/utils.py +246 -0
- vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
- vllm/attention/ops/flashmla.py +116 -0
- vllm/attention/ops/hpu_paged_attn.py +88 -0
- vllm/attention/ops/ipex_attn.py +195 -0
- vllm/attention/ops/merge_attn_states.py +43 -0
- vllm/attention/ops/nki_flash_attn.py +906 -0
- vllm/attention/ops/paged_attn.py +256 -0
- vllm/attention/ops/prefix_prefill.py +902 -0
- vllm/attention/ops/rocm_aiter_mla.py +100 -0
- vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
- vllm/attention/ops/triton_decode_attention.py +674 -0
- vllm/attention/ops/triton_flash_attention.py +979 -0
- vllm/attention/ops/triton_merge_attn_states.py +97 -0
- vllm/attention/ops/triton_unified_attention.py +334 -0
- vllm/attention/selector.py +187 -0
- vllm/attention/utils/fa_utils.py +55 -0
- vllm/beam_search.py +87 -0
- vllm/benchmarks/__init__.py +0 -0
- vllm/benchmarks/datasets.py +1185 -0
- vllm/benchmarks/endpoint_request_func.py +381 -0
- vllm/benchmarks/latency.py +168 -0
- vllm/benchmarks/serve.py +1135 -0
- vllm/benchmarks/throughput.py +609 -0
- vllm/benchmarks/utils.py +70 -0
- vllm/collect_env.py +820 -0
- vllm/compilation/__init__.py +0 -0
- vllm/compilation/activation_quant_fusion.py +89 -0
- vllm/compilation/backends.py +563 -0
- vllm/compilation/base_piecewise_backend.py +72 -0
- vllm/compilation/collective_fusion.py +127 -0
- vllm/compilation/compiler_interface.py +544 -0
- vllm/compilation/counter.py +38 -0
- vllm/compilation/cuda_piecewise_backend.py +214 -0
- vllm/compilation/decorators.py +250 -0
- vllm/compilation/fix_functionalization.py +191 -0
- vllm/compilation/fusion.py +618 -0
- vllm/compilation/fx_utils.py +62 -0
- vllm/compilation/inductor_pass.py +115 -0
- vllm/compilation/monitor.py +39 -0
- vllm/compilation/multi_output_match.py +109 -0
- vllm/compilation/noop_elimination.py +137 -0
- vllm/compilation/pass_manager.py +78 -0
- vllm/compilation/sequence_parallelism.py +268 -0
- vllm/compilation/torch25_custom_graph_pass.py +42 -0
- vllm/compilation/vllm_inductor_pass.py +67 -0
- vllm/compilation/wrapper.py +135 -0
- vllm/config.py +4746 -0
- vllm/connections.py +174 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +399 -0
- vllm/core/block/common.py +371 -0
- vllm/core/block/cpu_gpu_block_allocator.py +441 -0
- vllm/core/block/interfaces.py +319 -0
- vllm/core/block/naive_block.py +466 -0
- vllm/core/block/prefix_caching_block.py +1135 -0
- vllm/core/block/utils.py +28 -0
- vllm/core/block_manager.py +521 -0
- vllm/core/evictor.py +157 -0
- vllm/core/interfaces.py +135 -0
- vllm/core/placeholder_block_space_manager.py +100 -0
- vllm/core/scheduler.py +2093 -0
- vllm/device_allocator/__init__.py +0 -0
- vllm/device_allocator/cumem.py +281 -0
- vllm/distributed/__init__.py +6 -0
- vllm/distributed/communication_op.py +41 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/all2all.py +264 -0
- vllm/distributed/device_communicators/base_device_communicator.py +260 -0
- vllm/distributed/device_communicators/cpu_communicator.py +145 -0
- vllm/distributed/device_communicators/cuda_communicator.py +176 -0
- vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
- vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
- vllm/distributed/device_communicators/hpu_communicator.py +46 -0
- vllm/distributed/device_communicators/neuron_communicator.py +20 -0
- vllm/distributed/device_communicators/pynccl.py +218 -0
- vllm/distributed/device_communicators/pynccl_wrapper.py +341 -0
- vllm/distributed/device_communicators/shm_broadcast.py +585 -0
- vllm/distributed/device_communicators/tpu_communicator.py +103 -0
- vllm/distributed/device_communicators/xpu_communicator.py +55 -0
- vllm/distributed/kv_events.py +356 -0
- vllm/distributed/kv_transfer/README.md +29 -0
- vllm/distributed/kv_transfer/__init__.py +12 -0
- vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
- vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
- vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
- vllm/distributed/kv_transfer/kv_connector/factory.py +128 -0
- vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
- vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
- vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
- vllm/distributed/kv_transfer/kv_connector/utils.py +108 -0
- vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
- vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
- vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +134 -0
- vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1030 -0
- vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +384 -0
- vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
- vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
- vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
- vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
- vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
- vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
- vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
- vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +280 -0
- vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
- vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
- vllm/distributed/parallel_state.py +1296 -0
- vllm/distributed/tpu_distributed_utils.py +177 -0
- vllm/distributed/utils.py +536 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +1708 -0
- vllm/engine/async_llm_engine.py +1200 -0
- vllm/engine/async_timeout.py +173 -0
- vllm/engine/llm_engine.py +2097 -0
- vllm/engine/metrics.py +629 -0
- vllm/engine/metrics_types.py +94 -0
- vllm/engine/multiprocessing/__init__.py +148 -0
- vllm/engine/multiprocessing/client.py +681 -0
- vllm/engine/multiprocessing/engine.py +460 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +75 -0
- vllm/engine/output_processor/multi_step.py +216 -0
- vllm/engine/output_processor/single_step.py +145 -0
- vllm/engine/output_processor/stop_checker.py +131 -0
- vllm/engine/output_processor/util.py +28 -0
- vllm/engine/protocol.py +317 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +178 -0
- vllm/entrypoints/chat_utils.py +1299 -0
- vllm/entrypoints/cli/__init__.py +0 -0
- vllm/entrypoints/cli/benchmark/__init__.py +0 -0
- vllm/entrypoints/cli/benchmark/base.py +39 -0
- vllm/entrypoints/cli/benchmark/latency.py +30 -0
- vllm/entrypoints/cli/benchmark/main.py +54 -0
- vllm/entrypoints/cli/benchmark/serve.py +30 -0
- vllm/entrypoints/cli/benchmark/throughput.py +30 -0
- vllm/entrypoints/cli/collect_env.py +35 -0
- vllm/entrypoints/cli/main.py +65 -0
- vllm/entrypoints/cli/openai.py +205 -0
- vllm/entrypoints/cli/run_batch.py +62 -0
- vllm/entrypoints/cli/serve.py +328 -0
- vllm/entrypoints/cli/types.py +25 -0
- vllm/entrypoints/launcher.py +147 -0
- vllm/entrypoints/llm.py +1544 -0
- vllm/entrypoints/logger.py +50 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +1387 -0
- vllm/entrypoints/openai/cli_args.py +315 -0
- vllm/entrypoints/openai/logits_processors.py +90 -0
- vllm/entrypoints/openai/protocol.py +1913 -0
- vllm/entrypoints/openai/run_batch.py +463 -0
- vllm/entrypoints/openai/serving_chat.py +1221 -0
- vllm/entrypoints/openai/serving_classification.py +160 -0
- vllm/entrypoints/openai/serving_completion.py +592 -0
- vllm/entrypoints/openai/serving_embedding.py +201 -0
- vllm/entrypoints/openai/serving_engine.py +986 -0
- vllm/entrypoints/openai/serving_models.py +315 -0
- vllm/entrypoints/openai/serving_pooling.py +232 -0
- vllm/entrypoints/openai/serving_score.py +433 -0
- vllm/entrypoints/openai/serving_tokenization.py +157 -0
- vllm/entrypoints/openai/serving_transcription.py +424 -0
- vllm/entrypoints/openai/tool_parsers/__init__.py +23 -0
- vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
- vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
- vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
- vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
- vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
- vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
- vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
- vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
- vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
- vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
- vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
- vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
- vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
- vllm/entrypoints/score_utils.py +50 -0
- vllm/entrypoints/ssl.py +75 -0
- vllm/entrypoints/utils.py +233 -0
- vllm/env_override.py +41 -0
- vllm/envs.py +944 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/executor_base.py +401 -0
- vllm/executor/mp_distributed_executor.py +244 -0
- vllm/executor/msgspec_utils.py +30 -0
- vllm/executor/multiproc_worker_utils.py +313 -0
- vllm/executor/ray_distributed_executor.py +701 -0
- vllm/executor/ray_utils.py +399 -0
- vllm/executor/uniproc_executor.py +139 -0
- vllm/forward_context.py +179 -0
- vllm/inputs/__init__.py +41 -0
- vllm/inputs/data.py +331 -0
- vllm/inputs/parse.py +151 -0
- vllm/inputs/preprocess.py +909 -0
- vllm/inputs/registry.py +237 -0
- vllm/jsontree.py +80 -0
- vllm/logger.py +212 -0
- vllm/logging_utils/__init__.py +8 -0
- vllm/logging_utils/dump_input.py +85 -0
- vllm/logging_utils/formatter.py +18 -0
- vllm/logits_process.py +119 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +355 -0
- vllm/lora/layers.py +1285 -0
- vllm/lora/lora.py +199 -0
- vllm/lora/models.py +818 -0
- vllm/lora/ops/__init__.py +0 -0
- vllm/lora/ops/torch_ops/__init__.py +16 -0
- vllm/lora/ops/torch_ops/lora_ops.py +119 -0
- vllm/lora/ops/triton_ops/__init__.py +12 -0
- vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
- vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
- vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
- vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
- vllm/lora/ops/triton_ops/utils.py +120 -0
- vllm/lora/ops/xla_ops/__init__.py +7 -0
- vllm/lora/ops/xla_ops/lora_ops.py +145 -0
- vllm/lora/peft_helper.py +136 -0
- vllm/lora/punica_wrapper/__init__.py +10 -0
- vllm/lora/punica_wrapper/punica_base.py +485 -0
- vllm/lora/punica_wrapper/punica_cpu.py +349 -0
- vllm/lora/punica_wrapper/punica_gpu.py +290 -0
- vllm/lora/punica_wrapper/punica_hpu.py +145 -0
- vllm/lora/punica_wrapper/punica_selector.py +20 -0
- vllm/lora/punica_wrapper/punica_tpu.py +405 -0
- vllm/lora/punica_wrapper/utils.py +164 -0
- vllm/lora/request.py +99 -0
- vllm/lora/resolver.py +85 -0
- vllm/lora/utils.py +240 -0
- vllm/lora/worker_manager.py +259 -0
- vllm/model_executor/__init__.py +16 -0
- vllm/model_executor/custom_op.py +152 -0
- vllm/model_executor/guided_decoding/__init__.py +181 -0
- vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
- vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
- vllm/model_executor/guided_decoding/guided_fields.py +41 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
- vllm/model_executor/guided_decoding/utils.py +242 -0
- vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +369 -0
- vllm/model_executor/layers/fused_moe/__init__.py +54 -0
- vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +125 -0
- vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +117 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/README +12 -0
- vllm/model_executor/layers/fused_moe/cutlass_moe.py +461 -0
- vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +240 -0
- vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +240 -0
- vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +186 -0
- vllm/model_executor/layers/fused_moe/fused_batched_moe.py +775 -0
- vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +232 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +1724 -0
- vllm/model_executor/layers/fused_moe/layer.py +1535 -0
- vllm/model_executor/layers/fused_moe/modular_kernel.py +446 -0
- vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
- vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
- vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
- vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
- vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +159 -0
- vllm/model_executor/layers/fused_moe/prepare_finalize.py +69 -0
- vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +421 -0
- vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +117 -0
- vllm/model_executor/layers/fused_moe/utils.py +98 -0
- vllm/model_executor/layers/layernorm.py +288 -0
- vllm/model_executor/layers/lightning_attn.py +652 -0
- vllm/model_executor/layers/linear.py +1524 -0
- vllm/model_executor/layers/logits_processor.py +197 -0
- vllm/model_executor/layers/mamba/__init__.py +0 -0
- vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
- vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
- vllm/model_executor/layers/mamba/mamba_mixer2.py +616 -0
- vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
- vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
- vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
- vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
- vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
- vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
- vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
- vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
- vllm/model_executor/layers/pooler.py +350 -0
- vllm/model_executor/layers/quantization/__init__.py +157 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/auto_round.py +310 -0
- vllm/model_executor/layers/quantization/awq.py +194 -0
- vllm/model_executor/layers/quantization/awq_marlin.py +519 -0
- vllm/model_executor/layers/quantization/awq_triton.py +320 -0
- vllm/model_executor/layers/quantization/base_config.py +151 -0
- vllm/model_executor/layers/quantization/bitblas.py +461 -0
- vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
- vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
- vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +668 -0
- vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1260 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +93 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +178 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
- vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
- vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
- vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
- vllm/model_executor/layers/quantization/experts_int8.py +196 -0
- vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
- vllm/model_executor/layers/quantization/fp8.py +906 -0
- vllm/model_executor/layers/quantization/gguf.py +565 -0
- vllm/model_executor/layers/quantization/gptq.py +278 -0
- vllm/model_executor/layers/quantization/gptq_bitblas.py +445 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +648 -0
- vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
- vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
- vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
- vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +120 -0
- vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
- vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
- vllm/model_executor/layers/quantization/kv_cache.py +139 -0
- vllm/model_executor/layers/quantization/marlin.py +261 -0
- vllm/model_executor/layers/quantization/modelopt.py +737 -0
- vllm/model_executor/layers/quantization/moe_wna16.py +449 -0
- vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
- vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
- vllm/model_executor/layers/quantization/qqq.py +275 -0
- vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
- vllm/model_executor/layers/quantization/quark/quark.py +441 -0
- vllm/model_executor/layers/quantization/quark/quark_moe.py +237 -0
- vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
- vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
- vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +146 -0
- vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
- vllm/model_executor/layers/quantization/quark/utils.py +105 -0
- vllm/model_executor/layers/quantization/schema.py +86 -0
- vllm/model_executor/layers/quantization/torchao.py +161 -0
- vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
- vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
- vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
- vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
- vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
- vllm/model_executor/layers/quantization/utils/fp8_utils.py +618 -0
- vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
- vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
- vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
- vllm/model_executor/layers/quantization/utils/machete_utils.py +33 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
- vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
- vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
- vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +104 -0
- vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
- vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
- vllm/model_executor/layers/rejection_sampler.py +406 -0
- vllm/model_executor/layers/resampler.py +270 -0
- vllm/model_executor/layers/rotary_embedding.py +1862 -0
- vllm/model_executor/layers/sampler.py +1204 -0
- vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
- vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
- vllm/model_executor/layers/utils.py +95 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
- vllm/model_executor/model_loader/__init__.py +76 -0
- vllm/model_executor/model_loader/base_loader.py +43 -0
- vllm/model_executor/model_loader/bitsandbytes_loader.py +570 -0
- vllm/model_executor/model_loader/default_loader.py +282 -0
- vllm/model_executor/model_loader/dummy_loader.py +27 -0
- vllm/model_executor/model_loader/gguf_loader.py +120 -0
- vllm/model_executor/model_loader/neuron.py +476 -0
- vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
- vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
- vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
- vllm/model_executor/model_loader/tensorizer.py +600 -0
- vllm/model_executor/model_loader/tensorizer_loader.py +123 -0
- vllm/model_executor/model_loader/tpu.py +112 -0
- vllm/model_executor/model_loader/utils.py +302 -0
- vllm/model_executor/model_loader/weight_utils.py +782 -0
- vllm/model_executor/models/__init__.py +28 -0
- vllm/model_executor/models/adapters.py +248 -0
- vllm/model_executor/models/aimv2.py +246 -0
- vllm/model_executor/models/arctic.py +559 -0
- vllm/model_executor/models/aria.py +657 -0
- vllm/model_executor/models/aya_vision.py +466 -0
- vllm/model_executor/models/baichuan.py +474 -0
- vllm/model_executor/models/bamba.py +543 -0
- vllm/model_executor/models/bart.py +938 -0
- vllm/model_executor/models/bert.py +523 -0
- vllm/model_executor/models/bert_with_rope.py +769 -0
- vllm/model_executor/models/blip.py +339 -0
- vllm/model_executor/models/blip2.py +718 -0
- vllm/model_executor/models/bloom.py +373 -0
- vllm/model_executor/models/chameleon.py +1136 -0
- vllm/model_executor/models/chatglm.py +478 -0
- vllm/model_executor/models/clip.py +407 -0
- vllm/model_executor/models/commandr.py +472 -0
- vllm/model_executor/models/constant_size_cache.py +137 -0
- vllm/model_executor/models/dbrx.py +472 -0
- vllm/model_executor/models/deepseek.py +486 -0
- vllm/model_executor/models/deepseek_mtp.py +269 -0
- vllm/model_executor/models/deepseek_v2.py +843 -0
- vllm/model_executor/models/deepseek_vl2.py +648 -0
- vllm/model_executor/models/eagle.py +260 -0
- vllm/model_executor/models/exaone.py +551 -0
- vllm/model_executor/models/fairseq2_llama.py +154 -0
- vllm/model_executor/models/falcon.py +510 -0
- vllm/model_executor/models/falcon_h1.py +685 -0
- vllm/model_executor/models/florence2.py +1103 -0
- vllm/model_executor/models/fuyu.py +389 -0
- vllm/model_executor/models/gemma.py +425 -0
- vllm/model_executor/models/gemma2.py +425 -0
- vllm/model_executor/models/gemma3.py +533 -0
- vllm/model_executor/models/gemma3_mm.py +709 -0
- vllm/model_executor/models/glm.py +23 -0
- vllm/model_executor/models/glm4.py +305 -0
- vllm/model_executor/models/glm4v.py +648 -0
- vllm/model_executor/models/gpt2.py +328 -0
- vllm/model_executor/models/gpt_bigcode.py +335 -0
- vllm/model_executor/models/gpt_j.py +339 -0
- vllm/model_executor/models/gpt_neox.py +332 -0
- vllm/model_executor/models/granite.py +493 -0
- vllm/model_executor/models/granite_speech.py +779 -0
- vllm/model_executor/models/granitemoe.py +437 -0
- vllm/model_executor/models/granitemoehybrid.py +586 -0
- vllm/model_executor/models/granitemoeshared.py +341 -0
- vllm/model_executor/models/gritlm.py +224 -0
- vllm/model_executor/models/grok1.py +546 -0
- vllm/model_executor/models/h2ovl.py +546 -0
- vllm/model_executor/models/idefics2_vision_model.py +389 -0
- vllm/model_executor/models/idefics3.py +776 -0
- vllm/model_executor/models/interfaces.py +572 -0
- vllm/model_executor/models/interfaces_base.py +164 -0
- vllm/model_executor/models/intern_vit.py +480 -0
- vllm/model_executor/models/internlm2.py +455 -0
- vllm/model_executor/models/internlm2_ve.py +147 -0
- vllm/model_executor/models/internvl.py +1418 -0
- vllm/model_executor/models/jais.py +373 -0
- vllm/model_executor/models/jamba.py +592 -0
- vllm/model_executor/models/kimi_vl.py +577 -0
- vllm/model_executor/models/llama.py +644 -0
- vllm/model_executor/models/llama4.py +532 -0
- vllm/model_executor/models/llama_eagle.py +165 -0
- vllm/model_executor/models/llama_eagle3.py +263 -0
- vllm/model_executor/models/llava.py +866 -0
- vllm/model_executor/models/llava_next.py +586 -0
- vllm/model_executor/models/llava_next_video.py +471 -0
- vllm/model_executor/models/llava_onevision.py +956 -0
- vllm/model_executor/models/mamba.py +273 -0
- vllm/model_executor/models/mamba2.py +308 -0
- vllm/model_executor/models/mamba_cache.py +76 -0
- vllm/model_executor/models/medusa.py +219 -0
- vllm/model_executor/models/mimo.py +192 -0
- vllm/model_executor/models/mimo_mtp.py +285 -0
- vllm/model_executor/models/minicpm.py +592 -0
- vllm/model_executor/models/minicpm3.py +230 -0
- vllm/model_executor/models/minicpm_eagle.py +391 -0
- vllm/model_executor/models/minicpmo.py +759 -0
- vllm/model_executor/models/minicpmv.py +1287 -0
- vllm/model_executor/models/minimax_cache.py +36 -0
- vllm/model_executor/models/minimax_text_01.py +1301 -0
- vllm/model_executor/models/minimax_vl_01.py +364 -0
- vllm/model_executor/models/mistral3.py +604 -0
- vllm/model_executor/models/mixtral.py +488 -0
- vllm/model_executor/models/mixtral_quant.py +453 -0
- vllm/model_executor/models/mllama.py +1624 -0
- vllm/model_executor/models/mllama4.py +938 -0
- vllm/model_executor/models/mlp_speculator.py +206 -0
- vllm/model_executor/models/modernbert.py +331 -0
- vllm/model_executor/models/module_mapping.py +72 -0
- vllm/model_executor/models/molmo.py +1568 -0
- vllm/model_executor/models/moonvit.py +630 -0
- vllm/model_executor/models/mpt.py +331 -0
- vllm/model_executor/models/nemotron.py +508 -0
- vllm/model_executor/models/nemotron_h.py +573 -0
- vllm/model_executor/models/nemotron_nas.py +484 -0
- vllm/model_executor/models/nvlm_d.py +216 -0
- vllm/model_executor/models/olmo.py +389 -0
- vllm/model_executor/models/olmo2.py +414 -0
- vllm/model_executor/models/olmoe.py +468 -0
- vllm/model_executor/models/opt.py +412 -0
- vllm/model_executor/models/orion.py +349 -0
- vllm/model_executor/models/ovis.py +567 -0
- vllm/model_executor/models/paligemma.py +398 -0
- vllm/model_executor/models/persimmon.py +344 -0
- vllm/model_executor/models/phi.py +356 -0
- vllm/model_executor/models/phi3.py +19 -0
- vllm/model_executor/models/phi3_small.py +465 -0
- vllm/model_executor/models/phi3v.py +723 -0
- vllm/model_executor/models/phi4mm.py +1246 -0
- vllm/model_executor/models/phi4mm_audio.py +1233 -0
- vllm/model_executor/models/phi4mm_utils.py +1884 -0
- vllm/model_executor/models/phimoe.py +665 -0
- vllm/model_executor/models/pixtral.py +1316 -0
- vllm/model_executor/models/plamo2.py +738 -0
- vllm/model_executor/models/prithvi_geospatial_mae.py +232 -0
- vllm/model_executor/models/qwen.py +362 -0
- vllm/model_executor/models/qwen2.py +497 -0
- vllm/model_executor/models/qwen2_5_omni_thinker.py +904 -0
- vllm/model_executor/models/qwen2_5_vl.py +1166 -0
- vllm/model_executor/models/qwen2_audio.py +410 -0
- vllm/model_executor/models/qwen2_moe.py +540 -0
- vllm/model_executor/models/qwen2_rm.py +132 -0
- vllm/model_executor/models/qwen2_vl.py +1405 -0
- vllm/model_executor/models/qwen3.py +321 -0
- vllm/model_executor/models/qwen3_moe.py +535 -0
- vllm/model_executor/models/qwen_vl.py +785 -0
- vllm/model_executor/models/registry.py +622 -0
- vllm/model_executor/models/roberta.py +276 -0
- vllm/model_executor/models/siglip.py +524 -0
- vllm/model_executor/models/skyworkr1v.py +951 -0
- vllm/model_executor/models/smolvlm.py +52 -0
- vllm/model_executor/models/solar.py +506 -0
- vllm/model_executor/models/stablelm.py +343 -0
- vllm/model_executor/models/starcoder2.py +356 -0
- vllm/model_executor/models/tarsier.py +643 -0
- vllm/model_executor/models/telechat2.py +140 -0
- vllm/model_executor/models/teleflm.py +79 -0
- vllm/model_executor/models/transformers.py +508 -0
- vllm/model_executor/models/ultravox.py +656 -0
- vllm/model_executor/models/utils.py +731 -0
- vllm/model_executor/models/vision.py +147 -0
- vllm/model_executor/models/whisper.py +747 -0
- vllm/model_executor/models/zamba2.py +1009 -0
- vllm/model_executor/parameter.py +459 -0
- vllm/model_executor/pooling_metadata.py +72 -0
- vllm/model_executor/sampling_metadata.py +597 -0
- vllm/model_executor/utils.py +77 -0
- vllm/multimodal/__init__.py +33 -0
- vllm/multimodal/audio.py +106 -0
- vllm/multimodal/base.py +219 -0
- vllm/multimodal/hasher.py +118 -0
- vllm/multimodal/image.py +97 -0
- vllm/multimodal/inputs.py +876 -0
- vllm/multimodal/parse.py +461 -0
- vllm/multimodal/processing.py +1895 -0
- vllm/multimodal/profiling.py +258 -0
- vllm/multimodal/registry.py +331 -0
- vllm/multimodal/utils.py +436 -0
- vllm/multimodal/video.py +198 -0
- vllm/outputs.py +512 -0
- vllm/platforms/__init__.py +291 -0
- vllm/platforms/cpu.py +266 -0
- vllm/platforms/cuda.py +526 -0
- vllm/platforms/hpu.py +106 -0
- vllm/platforms/interface.py +538 -0
- vllm/platforms/neuron.py +150 -0
- vllm/platforms/rocm.py +435 -0
- vllm/platforms/tpu.py +216 -0
- vllm/platforms/xpu.py +156 -0
- vllm/plugins/__init__.py +94 -0
- vllm/plugins/lora_resolvers/README.md +15 -0
- vllm/plugins/lora_resolvers/__init__.py +0 -0
- vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
- vllm/pooling_params.py +54 -0
- vllm/profiler/__init__.py +0 -0
- vllm/profiler/layerwise_profile.py +375 -0
- vllm/profiler/utils.py +148 -0
- vllm/prompt_adapter/__init__.py +0 -0
- vllm/prompt_adapter/layers.py +83 -0
- vllm/prompt_adapter/models.py +358 -0
- vllm/prompt_adapter/request.py +37 -0
- vllm/prompt_adapter/utils.py +98 -0
- vllm/prompt_adapter/worker_manager.py +179 -0
- vllm/py.typed +2 -0
- vllm/reasoning/__init__.py +15 -0
- vllm/reasoning/abs_reasoning_parsers.py +192 -0
- vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
- vllm/reasoning/granite_reasoning_parser.py +363 -0
- vllm/reasoning/qwen3_reasoning_parser.py +151 -0
- vllm/sampling_params.py +602 -0
- vllm/scalar_type.py +347 -0
- vllm/scripts.py +15 -0
- vllm/sequence.py +1568 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +506 -0
- vllm/spec_decode/draft_model_runner.py +349 -0
- vllm/spec_decode/interfaces.py +99 -0
- vllm/spec_decode/medusa_worker.py +138 -0
- vllm/spec_decode/metrics.py +213 -0
- vllm/spec_decode/mlp_speculator_worker.py +94 -0
- vllm/spec_decode/mqa_scorer.py +160 -0
- vllm/spec_decode/multi_step_worker.py +423 -0
- vllm/spec_decode/ngram_worker.py +196 -0
- vllm/spec_decode/proposer_worker_base.py +59 -0
- vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
- vllm/spec_decode/spec_decode_worker.py +1326 -0
- vllm/spec_decode/target_model_runner.py +45 -0
- vllm/spec_decode/top1_proposer.py +275 -0
- vllm/spec_decode/util.py +277 -0
- vllm/test_utils.py +130 -0
- vllm/third_party/__init__.py +0 -0
- vllm/third_party/pynvml.py +6140 -0
- vllm/tracing.py +131 -0
- vllm/transformers_utils/__init__.py +24 -0
- vllm/transformers_utils/chat_templates/__init__.py +5 -0
- vllm/transformers_utils/chat_templates/registry.py +60 -0
- vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
- vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
- vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
- vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
- vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
- vllm/transformers_utils/config.py +887 -0
- vllm/transformers_utils/configs/__init__.py +61 -0
- vllm/transformers_utils/configs/arctic.py +207 -0
- vllm/transformers_utils/configs/chatglm.py +72 -0
- vllm/transformers_utils/configs/cohere2.py +195 -0
- vllm/transformers_utils/configs/dbrx.py +280 -0
- vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
- vllm/transformers_utils/configs/eagle.py +85 -0
- vllm/transformers_utils/configs/exaone.py +190 -0
- vllm/transformers_utils/configs/falcon.py +90 -0
- vllm/transformers_utils/configs/h2ovl.py +16 -0
- vllm/transformers_utils/configs/internvl.py +54 -0
- vllm/transformers_utils/configs/jais.py +238 -0
- vllm/transformers_utils/configs/kimi_vl.py +37 -0
- vllm/transformers_utils/configs/medusa.py +63 -0
- vllm/transformers_utils/configs/minimax_text_01.py +70 -0
- vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
- vllm/transformers_utils/configs/mllama.py +31 -0
- vllm/transformers_utils/configs/mlp_speculator.py +68 -0
- vllm/transformers_utils/configs/moonvit.py +33 -0
- vllm/transformers_utils/configs/mpt.py +180 -0
- vllm/transformers_utils/configs/nemotron.py +205 -0
- vllm/transformers_utils/configs/nemotron_h.py +258 -0
- vllm/transformers_utils/configs/nvlm_d.py +15 -0
- vllm/transformers_utils/configs/ovis.py +184 -0
- vllm/transformers_utils/configs/skyworkr1v.py +54 -0
- vllm/transformers_utils/configs/solar.py +247 -0
- vllm/transformers_utils/configs/telechat2.py +64 -0
- vllm/transformers_utils/configs/ultravox.py +108 -0
- vllm/transformers_utils/detokenizer.py +168 -0
- vllm/transformers_utils/detokenizer_utils.py +189 -0
- vllm/transformers_utils/processor.py +221 -0
- vllm/transformers_utils/processors/__init__.py +8 -0
- vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
- vllm/transformers_utils/processors/ovis.py +420 -0
- vllm/transformers_utils/s3_utils.py +162 -0
- vllm/transformers_utils/tokenizer.py +302 -0
- vllm/transformers_utils/tokenizer_base.py +149 -0
- vllm/transformers_utils/tokenizer_group.py +120 -0
- vllm/transformers_utils/tokenizers/__init__.py +10 -0
- vllm/transformers_utils/tokenizers/mistral.py +493 -0
- vllm/transformers_utils/utils.py +99 -0
- vllm/triton_utils/__init__.py +14 -0
- vllm/triton_utils/importing.py +50 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +256 -0
- vllm/utils.py +2910 -0
- vllm/v1/__init__.py +0 -0
- vllm/v1/attention/__init__.py +0 -0
- vllm/v1/attention/backends/__init__.py +0 -0
- vllm/v1/attention/backends/cpu_attn.py +163 -0
- vllm/v1/attention/backends/flash_attn.py +869 -0
- vllm/v1/attention/backends/flashinfer.py +651 -0
- vllm/v1/attention/backends/flex_attention.py +477 -0
- vllm/v1/attention/backends/mla/__init__.py +0 -0
- vllm/v1/attention/backends/mla/common.py +931 -0
- vllm/v1/attention/backends/mla/cutlass_mla.py +97 -0
- vllm/v1/attention/backends/mla/flashmla.py +152 -0
- vllm/v1/attention/backends/mla/rocm_aiter_mla.py +220 -0
- vllm/v1/attention/backends/mla/triton_mla.py +120 -0
- vllm/v1/attention/backends/pallas.py +240 -0
- vllm/v1/attention/backends/triton_attn.py +285 -0
- vllm/v1/attention/backends/utils.py +52 -0
- vllm/v1/core/__init__.py +0 -0
- vllm/v1/core/block_pool.py +349 -0
- vllm/v1/core/encoder_cache_manager.py +150 -0
- vllm/v1/core/kv_cache_coordinator.py +363 -0
- vllm/v1/core/kv_cache_manager.py +392 -0
- vllm/v1/core/kv_cache_utils.py +996 -0
- vllm/v1/core/sched/__init__.py +0 -0
- vllm/v1/core/sched/interface.py +150 -0
- vllm/v1/core/sched/output.py +154 -0
- vllm/v1/core/sched/scheduler.py +1044 -0
- vllm/v1/core/sched/utils.py +23 -0
- vllm/v1/core/single_type_kv_cache_manager.py +403 -0
- vllm/v1/engine/__init__.py +173 -0
- vllm/v1/engine/async_llm.py +558 -0
- vllm/v1/engine/coordinator.py +253 -0
- vllm/v1/engine/core.py +961 -0
- vllm/v1/engine/core_client.py +1129 -0
- vllm/v1/engine/detokenizer.py +261 -0
- vllm/v1/engine/exceptions.py +17 -0
- vllm/v1/engine/llm_engine.py +317 -0
- vllm/v1/engine/logprobs.py +199 -0
- vllm/v1/engine/mm_input_cache.py +91 -0
- vllm/v1/engine/output_processor.py +428 -0
- vllm/v1/engine/parallel_sampling.py +133 -0
- vllm/v1/engine/processor.py +407 -0
- vllm/v1/executor/__init__.py +0 -0
- vllm/v1/executor/abstract.py +113 -0
- vllm/v1/executor/multiproc_executor.py +537 -0
- vllm/v1/executor/ray_distributed_executor.py +62 -0
- vllm/v1/kv_cache_interface.py +194 -0
- vllm/v1/metrics/__init__.py +0 -0
- vllm/v1/metrics/loggers.py +523 -0
- vllm/v1/metrics/prometheus.py +82 -0
- vllm/v1/metrics/ray_wrappers.py +131 -0
- vllm/v1/metrics/reader.py +246 -0
- vllm/v1/metrics/stats.py +239 -0
- vllm/v1/outputs.py +116 -0
- vllm/v1/request.py +193 -0
- vllm/v1/sample/__init__.py +0 -0
- vllm/v1/sample/metadata.py +44 -0
- vllm/v1/sample/ops/__init__.py +0 -0
- vllm/v1/sample/ops/bad_words.py +39 -0
- vllm/v1/sample/ops/penalties.py +59 -0
- vllm/v1/sample/ops/topk_topp_sampler.py +293 -0
- vllm/v1/sample/rejection_sampler.py +631 -0
- vllm/v1/sample/sampler.py +286 -0
- vllm/v1/sample/tpu/__init__.py +0 -0
- vllm/v1/sample/tpu/metadata.py +124 -0
- vllm/v1/sample/tpu/sampler.py +145 -0
- vllm/v1/serial_utils.py +315 -0
- vllm/v1/spec_decode/__init__.py +0 -0
- vllm/v1/spec_decode/eagle.py +432 -0
- vllm/v1/spec_decode/medusa.py +62 -0
- vllm/v1/spec_decode/metadata.py +62 -0
- vllm/v1/spec_decode/metrics.py +178 -0
- vllm/v1/spec_decode/ngram_proposer.py +132 -0
- vllm/v1/spec_decode/utils.py +46 -0
- vllm/v1/structured_output/__init__.py +222 -0
- vllm/v1/structured_output/backend_guidance.py +245 -0
- vllm/v1/structured_output/backend_types.py +134 -0
- vllm/v1/structured_output/backend_xgrammar.py +318 -0
- vllm/v1/structured_output/request.py +86 -0
- vllm/v1/structured_output/utils.py +175 -0
- vllm/v1/utils.py +743 -0
- vllm/v1/worker/__init__.py +0 -0
- vllm/v1/worker/block_table.py +142 -0
- vllm/v1/worker/cpu_model_runner.py +86 -0
- vllm/v1/worker/cpu_worker.py +152 -0
- vllm/v1/worker/gpu_input_batch.py +681 -0
- vllm/v1/worker/gpu_model_runner.py +2320 -0
- vllm/v1/worker/gpu_worker.py +393 -0
- vllm/v1/worker/lora_model_runner_mixin.py +173 -0
- vllm/v1/worker/tpu_model_runner.py +1673 -0
- vllm/v1/worker/tpu_worker.py +299 -0
- vllm/v1/worker/utils.py +111 -0
- vllm/v1/worker/worker_base.py +65 -0
- vllm/version.py +41 -0
- vllm/vllm_flash_attn/.gitkeep +0 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +145 -0
- vllm/worker/cpu_enc_dec_model_runner.py +326 -0
- vllm/worker/cpu_model_runner.py +671 -0
- vllm/worker/cpu_pooling_model_runner.py +125 -0
- vllm/worker/cpu_worker.py +450 -0
- vllm/worker/enc_dec_model_runner.py +555 -0
- vllm/worker/hpu_model_runner.py +2320 -0
- vllm/worker/hpu_worker.py +484 -0
- vllm/worker/model_runner.py +2178 -0
- vllm/worker/model_runner_base.py +282 -0
- vllm/worker/multi_step_hpu_worker.py +123 -0
- vllm/worker/multi_step_model_runner.py +911 -0
- vllm/worker/multi_step_neuron_model_runner.py +84 -0
- vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
- vllm/worker/multi_step_tpu_worker.py +108 -0
- vllm/worker/multi_step_worker.py +197 -0
- vllm/worker/neuron_model_runner.py +460 -0
- vllm/worker/neuron_worker.py +193 -0
- vllm/worker/neuronx_distributed_model_runner.py +294 -0
- vllm/worker/pooling_model_runner.py +211 -0
- vllm/worker/tpu_model_runner.py +909 -0
- vllm/worker/tpu_worker.py +337 -0
- vllm/worker/utils.py +53 -0
- vllm/worker/worker.py +577 -0
- vllm/worker/worker_base.py +646 -0
- vllm/worker/xpu_model_runner.py +606 -0
- vllm/worker/xpu_worker.py +186 -0
- vllm_cpu_amxbf16-0.9.1.dist-info/METADATA +305 -0
- vllm_cpu_amxbf16-0.9.1.dist-info/RECORD +1197 -0
- vllm_cpu_amxbf16-0.9.1.dist-info/WHEEL +5 -0
- vllm_cpu_amxbf16-0.9.1.dist-info/entry_points.txt +5 -0
- vllm_cpu_amxbf16-0.9.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1135 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
3
|
+
"""Token blocks."""
|
|
4
|
+
import sys
|
|
5
|
+
from bisect import bisect_left
|
|
6
|
+
from os.path import commonprefix
|
|
7
|
+
from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set,
|
|
8
|
+
Tuple)
|
|
9
|
+
|
|
10
|
+
from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
|
|
11
|
+
get_all_blocks_recursively)
|
|
12
|
+
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device,
|
|
13
|
+
DeviceAwareBlockAllocator)
|
|
14
|
+
from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
|
|
15
|
+
NaiveBlockAllocator)
|
|
16
|
+
from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
|
|
17
|
+
from vllm.logger import init_logger
|
|
18
|
+
from vllm.sequence import Sequence
|
|
19
|
+
|
|
20
|
+
PrefixHash = int
|
|
21
|
+
|
|
22
|
+
# By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME
|
|
23
|
+
# so that if a block still holds _DEFAULT_LAST_ACCESSED_TIME,
|
|
24
|
+
# then we know this block hasn't been accessed yet.
|
|
25
|
+
_DEFAULT_LAST_ACCESSED_TIME = -1
|
|
26
|
+
|
|
27
|
+
logger = init_logger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class BlockTracker:
    """Per-block status record kept by the prefix caching allocator.

    Only three attributes exist (see ``__slots__``):

    * ``active``: whether the tracker is currently enabled.
    * ``last_accessed``: access time; equals
      ``_DEFAULT_LAST_ACCESSED_TIME`` right after a reset.
    * ``computed``: computed flag; ``False`` right after a reset.
    """
    __slots__ = ("active", "last_accessed", "computed")

    def __init__(self):
        # A fresh tracker starts out inactive with default status fields.
        self.active: bool = False
        self.reset()

    def reset(self):
        """Put ``last_accessed`` and ``computed`` back to their defaults.

        The ``active`` flag is deliberately left untouched.
        """
        self.computed: bool = False
        self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME

    def enable(self):
        """Switch the tracker on; asserts that it is currently off."""
        assert not self.active
        self.active = True
        self.reset()

    def disable(self):
        """Switch the tracker off; asserts that it is currently on."""
        assert self.active
        self.active = False
        self.reset()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class PrefixCachingBlockAllocator(BlockAllocator):
|
|
55
|
+
"""A block allocator that implements prefix caching.
|
|
56
|
+
|
|
57
|
+
The PrefixCachingBlockAllocator maintains a cache of blocks based on their
|
|
58
|
+
content hash. It reuses blocks with the same content hash to avoid redundant
|
|
59
|
+
memory allocation. The allocator also supports copy-on-write operations.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
num_blocks (int): The total number of blocks to manage.
|
|
63
|
+
block_size (int): The size of each block in tokens.
|
|
64
|
+
block_ids(Optional[Iterable[int]], optional): An optional iterable of
|
|
65
|
+
block IDs. If not provided, block IDs will be assigned sequentially
|
|
66
|
+
from 0 to num_blocks - 1.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
# Note that we use 'None' as a string here instead of None because
|
|
70
|
+
# as of Python 3.12, hash(None) returns a constant predictable value.
|
|
71
|
+
# This could possibly make it easier to find and exploit hash
|
|
72
|
+
# collisions. 'None' as a string will be hashed differently per process,
|
|
73
|
+
# but consistently within the same process. This is the same as the
|
|
74
|
+
# behavior of None prior to Python 3.12.
|
|
75
|
+
_none_hash: int = hash('None')
|
|
76
|
+
|
|
77
|
+
# Implements Block.Factory.
|
|
78
|
+
def __init__(
    self,
    num_blocks: int,
    block_size: int,
    block_ids: Optional[Iterable[int]] = None,
    eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
):
    """Set up the allocator's bookkeeping structures.

    Args:
        num_blocks: The total number of blocks to manage.
        block_size: The size of each block in tokens.
        block_ids: Optional block IDs to manage. When omitted,
            ``range(num_blocks)`` is used. Any iterable is accepted,
            including one-shot iterators, because the IDs are
            materialized once before use.
        eviction_policy: Policy passed to ``make_evictor`` to build the
            evictor.
    """
    # The IDs are walked twice below: once to build the tracker table
    # and once more by the hashless allocator. Materialize
    # caller-supplied iterables so that a generator is not already
    # exhausted by the time the hashless allocator receives it.
    if block_ids is None:
        block_ids = range(num_blocks)
    else:
        block_ids = tuple(block_ids)

    self._block_size = block_size

    # A mapping of prefix hash to block index. All blocks which have a
    # prefix hash will be in this dict, even if they have refcount 0.
    self._cached_blocks: Dict[PrefixHash, BlockId] = {}

    # A set of immutable block IDs that have been touched by scheduler
    # and should be marked as computed after an entire batch of sequences
    # are scheduled.
    self._touched_blocks: Set[BlockId] = set()

    # Used to track status of each physical block id
    self._block_tracker: Dict[BlockId, BlockTracker] = {
        block_id: BlockTracker()
        for block_id in block_ids
    }

    # Pre-allocate "num_blocks * extra_factor" block objects.
    # The "* extra_factor" is a buffer to allow more block objects
    # than physical blocks
    extra_factor = 4
    self._block_pool = BlockPool(self._block_size, self._create_block,
                                 self, num_blocks * extra_factor)

    # An allocator for blocks that do not have prefix hashes.
    self._hashless_allocator = NaiveBlockAllocator(
        create_block=self._create_block,  # type: ignore
        num_blocks=num_blocks,
        block_size=block_size,
        block_ids=block_ids,
        block_pool=self._block_pool,  # Share block pool here
    )

    # Evictor used to maintain how we want to handle those computed blocks
    # if we find memory pressure is high.
    self.eviction_policy = eviction_policy
    self.evictor: Evictor = make_evictor(self.eviction_policy)

    # We share the refcounter between allocators. This allows us to promote
    # blocks originally allocated in the hashless allocator to immutable
    # blocks.
    self._refcounter = self._hashless_allocator.refcounter

    self._cow_tracker = CopyOnWriteTracker(
        refcounter=self._refcounter.as_readonly())

    self.metric_data = CacheMetricData()
|
|
134
|
+
|
|
135
|
+
def _create_block(
    self,
    prev_block: Optional[Block],
    token_ids: List[int],
    block_size: int,
    allocator: BlockAllocator,
    block_id: Optional[int] = None,
    computed: bool = False,
    extra_hash: Optional[int] = None,
) -> Block:
    """Build a PrefixCachingBlock that is bound to this allocator.

    The ``allocator`` argument is accepted so the signature matches what
    the block pool / hashless allocator pass in, but its value is ignored:
    the created block always references ``self``.
    """
    block_kwargs = dict(
        prev_block=prev_block,
        token_ids=token_ids,
        block_size=block_size,
        block_id=block_id,
        # Bind block to self, regardless of the allocator passed in.
        allocator=self,
        computed=computed,
        extra_hash=extra_hash,
    )
    return PrefixCachingBlock(**block_kwargs)
|
|
157
|
+
|
|
158
|
+
def allocate_immutable_block(self,
                             prev_block: Optional[Block],
                             token_ids: List[int],
                             extra_hash: Optional[int] = None,
                             device: Optional[Device] = None) -> Block:
    """Allocate an immutable block holding ``token_ids``, reusing a cached
    physical block when one with the same content hash is registered.

    Args:
        prev_block (Optional[Block]): The previous block in the sequence.
        token_ids (List[int]): The token IDs to be stored in the block.
        extra_hash (Optional[int]): Forwarded to the block so it takes part
            in the content hash.
        device (Optional[Device]): Must be None.

    Returns:
        Block: The allocated immutable block.
    """
    assert device is None
    assert_prefix_caching_block_or_none(prev_block)

    # Build a probe block without a physical id; it is only needed to
    # obtain the content hash for the cache lookup.
    probe = self._block_pool.init_block(prev_block=prev_block,
                                        token_ids=token_ids,
                                        block_size=self._block_size,
                                        physical_block_id=None,
                                        extra_hash=extra_hash)
    assert probe.content_hash is not None

    hit_block_id = self._cached_blocks.get(probe.content_hash, None)
    if hit_block_id is None:
        # Cache miss: return the probe to the pool and fill a fresh
        # mutable block instead.
        self.metric_data.query(hit=False)
        self._block_pool.free_block(probe)

        fresh = self.allocate_mutable_block(prev_block, extra_hash=extra_hash)
        fresh.append_token_ids(token_ids)
        return fresh

    # Cache hit: point the probe at the cached physical block.
    self.metric_data.query(hit=True)
    probe.block_id = hit_block_id
    self._incr_refcount_cached_block(probe)
    return probe
|
|
197
|
+
|
|
198
|
+
def allocate_immutable_blocks(
        self,
        prev_block: Optional[Block],
        block_token_ids: List[List[int]],
        extra_hash: Optional[int] = None,
        device: Optional[Device] = None) -> List[Block]:
    """Allocate a chain of immutable blocks, one per entry of
    ``block_token_ids``.

    Each block is allocated with ``allocate_immutable_block`` and becomes
    the ``prev_block`` of the next one. Returns the blocks in order.
    """
    chain: List[Block] = []
    tail = prev_block
    for chunk in block_token_ids:
        tail = self.allocate_immutable_block(prev_block=tail,
                                             token_ids=chunk,
                                             device=device,
                                             extra_hash=extra_hash)
        chain.append(tail)
    return chain
|
|
212
|
+
|
|
213
|
+
def allocate_mutable_block(self,
                           prev_block: Optional[Block],
                           extra_hash: Optional[int] = None,
                           device: Optional[Device] = None) -> Block:
    """Allocate an empty mutable block.

    The physical block id comes from ``_allocate_block_id``, which falls
    back to evicting an unused cached block when no hashless block id is
    free.

    Args:
        prev_block (Optional[Block]): The previous block in the sequence.
        extra_hash (Optional[int]): Forwarded to the new block.
        device (Optional[Device]): Must be None.

    Returns:
        Block: The allocated mutable block.
    """
    assert device is None
    assert_prefix_caching_block_or_none(prev_block)

    physical_id = self._allocate_block_id()
    new_block = self._block_pool.init_block(prev_block=prev_block,
                                            token_ids=[],
                                            block_size=self._block_size,
                                            physical_block_id=physical_id,
                                            extra_hash=extra_hash)
    # A freshly initialised, empty block must be neither computed nor hashed.
    assert not new_block.computed
    assert new_block.content_hash is None
    return new_block
|
|
239
|
+
|
|
240
|
+
def _incr_refcount_cached_block(self, block: Block) -> None:
    """Take a reference on the cached physical block ``block`` points to.

    The block is flagged as computed because it refers to a cached block
    id (which was already computed). When this is the first reference,
    the id is removed from the evictor (if present) and tracked again.
    """
    block.computed = True

    cached_id = block.block_id
    assert cached_id is not None

    if self._refcounter.incr(cached_id) != 1:
        return

    # First reference: the cached block may have been parked in the
    # evictor, so restore its tracking.
    if cached_id in self.evictor:
        self.evictor.remove(cached_id)

    self._track_block_id(cached_id, computed=True)
|
|
255
|
+
|
|
256
|
+
def _decr_refcount_cached_block(self, block: Block) -> None:
    """Drop one reference on the cached physical block of ``block``.

    When the refcount reaches zero the block id is handed to the evictor
    (so the cached content can still be reused) and untracked. In every
    case ``block.block_id`` is cleared on return.
    """
    # Only immutable/cached blocks (those with a content hash) go here.
    assert block.content_hash is not None

    cached_id = block.block_id
    assert cached_id is not None

    remaining = self._refcounter.decr(cached_id)
    if remaining <= 0:
        assert remaining == 0

        # No longer used by anyone, but must still be a registered entry.
        assert block.content_hash in self._cached_blocks

        # Park the cached block in the evictor instead of freeing it.
        self.evictor.add(cached_id, block.content_hash,
                         block.num_tokens_total,
                         self._block_tracker[cached_id].last_accessed)

        # Stop tracking the block
        self._untrack_block_id(cached_id)

    block.block_id = None
|
|
282
|
+
|
|
283
|
+
def _decr_refcount_hashless_block(self, block: Block) -> None:
    """Release one reference on a mutable (hashless) block's id.

    The id is untracked only when this is the last reference; a forked
    (shared) block keeps its tracking. The block object itself is kept
    and left for the caller to handle.
    """
    target_id = block.block_id
    assert target_id is not None

    is_last_reference = self._refcounter.get(target_id) == 1
    if is_last_reference:
        self._untrack_block_id(target_id)

    # Decrement the refcount of the id via the hashless allocator, but do
    # not free the block object itself.
    self._hashless_allocator.free(block, keep_block_object=True)
|
|
296
|
+
|
|
297
|
+
def _allocate_block_id(self) -> BlockId:
    """Obtain a physical block id.

    The hashless allocator is tried first; if it has nothing free, an
    unused cached block is evicted instead.

    Raises:
        BlockAllocator.NoFreeBlocksError: If neither source yields an id.
    """
    sources = (self._maybe_allocate_hashless_block_id,
               self._maybe_allocate_evicted_block_id)
    for try_allocate in sources:
        candidate = try_allocate()
        if candidate is not None:
            return candidate

    # No block available in hashless allocator, nor in unused cache blocks.
    raise BlockAllocator.NoFreeBlocksError()
|
|
311
|
+
|
|
312
|
+
def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]:
    """Try to take a block id from the hashless allocator.

    Returns the id (now tracked as not computed), or None when
    ``BlockAllocator.NoFreeBlocksError`` is raised along the way.
    """
    try:
        scratch = self._hashless_allocator.allocate_mutable_block(
            prev_block=None)
        new_id = scratch.block_id
        # Only the id is wanted; hand the block object back to the pool.
        self._block_pool.free_block(scratch)

        self._track_block_id(new_id, computed=False)
    except BlockAllocator.NoFreeBlocksError:
        return None
    return new_id
|
|
324
|
+
|
|
325
|
+
def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
    """Evict one block from the evictor and hand out its id.

    Returns None when the evictor is empty. Otherwise the evicted block's
    content hash is dropped from ``_cached_blocks`` (its content is about
    to change), its refcount is incremented, and the id is tracked as not
    computed.
    """
    if self.evictor.num_blocks == 0:
        return None

    # Blocks only enter the evictor once their refcount is 0.
    victim_id, victim_hash = self.evictor.evict()

    # Sanity checks
    assert victim_hash in self._cached_blocks
    mapped_id = self._cached_blocks[victim_hash]
    assert self._refcounter.get(mapped_id) == 0
    assert mapped_id == victim_id

    del self._cached_blocks[victim_hash]

    self._refcounter.incr(victim_id)
    self._track_block_id(victim_id, computed=False)
    return victim_id
|
|
347
|
+
|
|
348
|
+
def _free_block_id(self, block: Block) -> None:
    """Decrement the refcount of ``block``'s physical id.

    A block with a content hash is immutable/cached: its refcount is
    decremented directly and it may end up in the evictor. A block without
    one is mutable/hashless and is released through the hashless
    allocator, keeping the block object (the caller may reuse it).
    Either way ``block.block_id`` is None afterwards.
    """
    assert block.block_id is not None, \
        "Freeing unallocated block is undefined"

    release = (self._decr_refcount_hashless_block
               if block.content_hash is None else
               self._decr_refcount_cached_block)
    release(block)

    assert block.block_id is None
|
|
369
|
+
|
|
370
|
+
def free(self, block: Block, keep_block_object: bool = False) -> None:
    """Release ``block``: free its physical block id and, unless
    ``keep_block_object`` is set, return the block object to the pool.
    """
    # Release the physical block index
    self._free_block_id(block)

    if keep_block_object:
        return

    # Release the block object to the pool
    self._block_pool.free_block(block)
|
|
379
|
+
|
|
380
|
+
def fork(self, last_block: Block) -> List[Block]:
    """Create a new sequence of blocks sharing the physical block ids of
    the sequence that ends in ``last_block``.

    Args:
        last_block (Block): The last block in the original sequence.

    Returns:
        List[Block]: New block objects, in sequence order, each pointing
            at the same physical block id as its source block.
    """
    clones: List[Block] = []
    tail: Optional[Block] = None
    for original in get_all_blocks_recursively(last_block):
        shared_id = original.block_id
        assert shared_id is not None

        # A count of exactly 1 after the increment means the id had no
        # owner before, i.e. the source block was already freed.
        new_count = self._refcounter.incr(shared_id)
        assert new_count != 1, "can't fork free'd block_id = {}".format(
            shared_id)

        tail = self._block_pool.init_block(
            prev_block=tail,
            token_ids=original.token_ids,
            block_size=self._block_size,
            physical_block_id=shared_id,
            extra_hash=original.extra_hash)
        clones.append(tail)

    return clones
|
|
414
|
+
|
|
415
|
+
def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
    """Return the free hashless blocks plus the blocks the evictor holds
    (which could be freed on demand). ``device`` must be None.
    """
    assert device is None
    hashless_free = self._hashless_allocator.get_num_free_blocks()
    evictable = self.evictor.num_blocks
    return hashless_free + evictable
|
|
421
|
+
|
|
422
|
+
def get_num_total_blocks(self) -> int:
    """Return the total block count reported by the hashless allocator."""
    total = self._hashless_allocator.get_num_total_blocks()
    return total
|
|
424
|
+
|
|
425
|
+
def get_physical_block_id(self, absolute_id: int) -> int:
    """Return the zero-offset block id on this allocator for the given
    absolute block id.

    Args:
        absolute_id (int): The absolute block id for the block
            in whole allocator.

    Returns:
        int: The position of ``absolute_id`` among this allocator's
            block ids in ascending order.
    """
    ordered_ids = sorted(self.all_block_ids)
    return ordered_ids.index(absolute_id)
|
|
437
|
+
|
|
438
|
+
@property
def all_block_ids(self) -> FrozenSet[int]:
    """All block ids, as reported by the hashless allocator."""
    hashless = self._hashless_allocator
    return hashless.all_block_ids
|
|
441
|
+
|
|
442
|
+
def get_prefix_cache_hit_rate(self) -> float:
    """Return the hit rate reported by ``self.metric_data``."""
    hit_rate = self.metric_data.get_hit_rate()
    return hit_rate
|
|
444
|
+
|
|
445
|
+
def reset_prefix_cache(self) -> bool:
    """Reset the prefix cache.

    This may be used in RLHF flows to invalidate prefix caching after the
    weights are updated, or to reset prefix caching status for
    benchmarking.

    Returns:
        bool: True if the prefix cache is successfully reset, False if
            some blocks are still in use (nothing is changed then).
    """
    in_use = self.get_num_total_blocks() - self.get_num_free_blocks()
    if in_use > 0:
        logger.warning(
            "Failed to reset prefix cache because some "
            "blocks (%d) are not freed yet", in_use)
        return False

    # Drain the evictor, returning every evicted id to the hashless
    # allocator.
    while True:
        evicted_id = self._maybe_allocate_evicted_block_id()
        if evicted_id is None:
            break
        self._hashless_allocator.free_block_id(evicted_id)

    # Should not have any cached blocks because all blocks are evicted.
    assert not self._cached_blocks

    # Start over with a fresh evictor, fresh trackers and fresh metrics.
    self.evictor = make_evictor(self.eviction_policy)
    for tracked_id in self._block_tracker:
        self._block_tracker[tracked_id] = BlockTracker()
    self.metric_data = CacheMetricData()

    logger.info("Successfully reset prefix cache")
    return True
|
|
482
|
+
|
|
483
|
+
def is_block_cached(self, block: Block) -> bool:
    """Return whether ``block``'s content hash is registered in the cache.

    The block must already have a content hash.
    """
    content_hash = block.content_hash
    assert content_hash is not None
    return content_hash in self._cached_blocks
|
|
486
|
+
|
|
487
|
+
def promote_to_immutable_block(self, block: Block) -> BlockId:
    """Promote a full mutable block to an immutable block, so its content
    can be referenced by future blocks having the same prefix.

    If a cached block with the same content hash already exists, the
    promoted block gives up its own physical id and is re-pointed at the
    existing cached block id.

    Args:
        block: The mutable block to be promoted.

    Returns:
        BlockId: Either the original block index, or the block index of
            the previously cached block matching the same content.
    """
    # Ensure block can be promoted
    assert block.content_hash is not None
    assert block.block_id is not None
    assert self._refcounter.get(block.block_id) > 0

    if block.content_hash in self._cached_blocks:
        # Same content is already cached: drop our own id and reuse it.
        self._decr_refcount_hashless_block(block)
        block.block_id = self._cached_blocks[block.content_hash]

        # Takes a reference on the cached block (possibly restoring it
        # from the evictor) and marks the block as computed.
        self._incr_refcount_cached_block(block)
        return block.block_id

    # First block with this content: register it as cached. It is not
    # marked as computed yet because other sequences in the same batch
    # cannot reuse this block.
    self._cached_blocks[block.content_hash] = block.block_id
    # Remember it as touched, so it can be marked as computed after the
    # entire batch of sequences are scheduled.
    self._touched_blocks.add(block.block_id)
    return block.block_id
|
|
529
|
+
|
|
530
|
+
def cow_block_if_not_appendable(self, block: Block) -> BlockId:
    """Perform a copy-on-write on ``block`` if it is not appendable.

    Args:
        block (Block): The block to check for copy-on-write.

    Returns:
        BlockId: The block index of the new block if a copy-on-write
            operation was performed, or the original block index if
            no copy-on-write was necessary.
    """
    src_block_id = block.block_id
    assert src_block_id is not None

    if not self._cow_tracker.is_appendable(block):
        # Give up the current id, take a new one, and record the pair.
        self._free_block_id(block)
        trg_block_id = self._allocate_block_id()
        self._cow_tracker.record_cow(src_block_id, trg_block_id)
        return trg_block_id

    return src_block_id
|
|
554
|
+
|
|
555
|
+
def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
    """Return the recorded copy-on-write source->destination pairs and
    clear them (delegates to the copy-on-write tracker).

    Returns:
        List[Tuple[BlockId, BlockId]]: A list mapping source
            block indices to destination block indices.
    """
    pending = self._cow_tracker.clear_cows()
    return pending
|
|
563
|
+
|
|
564
|
+
def mark_blocks_as_accessed(self, block_ids: List[int],
                            now: float) -> None:
    """Record ``now`` as the last access time of each given block id.

    Actively tracked blocks are updated in the block tracker; blocks held
    by the evictor are updated in the evictor's metadata instead.

    Raises:
        ValueError: If a block id is neither active nor in the evictor.
    """
    for block_id in block_ids:
        tracker = self._block_tracker[block_id]
        if tracker.active:
            tracker.last_accessed = now
            continue

        if block_id not in self.evictor:
            raise ValueError(
                "Mark block as accessed which is not belonged to GPU")
        self.evictor.update(block_id, now)
|
|
580
|
+
|
|
581
|
+
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
    """Mark every touched block as computed and clear the touched set.

    ``block_ids`` is not consulted; the set of blocks to mark is
    ``self._touched_blocks``.
    """
    touched = self._touched_blocks
    for touched_id in touched:
        self._block_tracker[touched_id].computed = True
    touched.clear()
|
|
586
|
+
|
|
587
|
+
def _track_block_id(self, block_id: Optional[BlockId],
                    computed: bool) -> None:
    """Enable the tracker of ``block_id`` and set its computed flag."""
    assert block_id is not None
    tracker = self._block_tracker[block_id]
    tracker.enable()
    tracker.computed = computed
|
|
592
|
+
|
|
593
|
+
def _untrack_block_id(self, block_id: Optional[BlockId]) -> None:
    """Disable the tracker of ``block_id``."""
    assert block_id is not None
    tracker = self._block_tracker[block_id]
    tracker.disable()
|
|
596
|
+
|
|
597
|
+
def block_is_computed(self, block_id: int) -> bool:
    """Return whether ``block_id`` counts as computed.

    For an actively tracked block this is the tracker's computed flag;
    otherwise it is whether the block id is held by the evictor.
    """
    tracker = self._block_tracker[block_id]
    return tracker.computed if tracker.active else block_id in self.evictor
|
|
602
|
+
|
|
603
|
+
def get_common_computed_block_ids(
        self, computed_seq_block_ids: List[List[int]]) -> List[int]:
    """Return the block ids that form the common prefix of the given
    per-sequence block id lists.

    Empty per-sequence lists are ignored when computing the prefix. A
    single sequence is returned as-is.

    Args:
        computed_seq_block_ids (List[List[int]]): One list of computed
            block ids per sequence of the sequence group.

    Returns:
        List[int]: The longest common leading run of block ids; an empty
            list when there is nothing to compare.
    """
    # NOTE(review): the original comment here says the last block is
    # excluded to avoid the case where the entire prompt is cached. No
    # such exclusion happens in this method - presumably the caller does
    # it; confirm before relying on it.
    if len(computed_seq_block_ids) == 1:
        return computed_seq_block_ids[0]

    candidates = [ids for ids in computed_seq_block_ids if ids]
    if not candidates:
        # os.path.commonprefix([]) returns the *string* '' rather than a
        # list, so answer the empty case ourselves to honour List[int].
        return []

    # commonprefix is annotated for strings but works element-wise on
    # lists of ints as well.
    return commonprefix(candidates)  # type: ignore
|
|
623
|
+
|
|
624
|
+
def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
    """Return the number of full blocks that will be touched by
    swapping in/out the given blocks.

    Args:
        blocks: List of blocks to be swapped.

    Returns:
        int: The number of full blocks that are either not cached, or
            whose cached block id currently sits in the evictor (cached
            but not referenced). Non-full blocks are ignored.
    """
    touched = 0
    for block in blocks:
        if not block.is_full:
            continue

        if not self.is_block_cached(block):
            touched += 1
            continue

        # Cached: still counts when the cached block is only held by the
        # evictor, i.e. nobody references it.
        content_hash = block.content_hash
        if (content_hash is not None
                and self._cached_blocks[content_hash] in self.evictor):
            touched += 1
    return touched
|
|
646
|
+
|
|
647
|
+
def swap_out(self, blocks: List[Block]) -> None:
    """Execute the swap-out: free the physical block id of every given
    block.

    Args:
        blocks: List of blocks to be swapped out.
    """
    for outgoing in blocks:
        self._free_block_id(outgoing)
|
|
656
|
+
|
|
657
|
+
def swap_in(self, blocks: List[Block]) -> None:
    """Execute the swap-in: give every block a block id from this
    allocator, finishing the block table update.

    Args:
        blocks: List of blocks to be swapped in.
    """
    for block in blocks:
        # Allocate a temporary immutable or mutable block purely to obtain
        # a block id; the temporary object goes back to the pool and the
        # id is assigned to the existing "block" object.
        if not block.is_full:
            scratch = self.allocate_mutable_block(
                prev_block=block.prev_block, extra_hash=block.extra_hash)
            scratch.append_token_ids(block.token_ids)
        else:
            scratch = self.allocate_immutable_block(
                prev_block=block.prev_block,
                token_ids=block.token_ids,
                extra_hash=block.extra_hash)

        new_id = scratch.block_id
        self._block_pool.free_block(scratch)

        block.block_id = new_id  # Assign block_id
|
|
684
|
+
|
|
685
|
+
def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
    """
    Given a list of block hashes, return the prefix of the block hashes that
    are all cached.

    Since a block's block hash includes the hashes of all previous blocks,
    and we only allocate/deallocate blocks in the entire sequence, so if a
    block is cached, then all previous blocks are also cached. With this
    property, we can use binary search to find the prefix of cached blocks.

    Args:
        block_hashes (List[int]): The list of block hashes.

    Returns:
        List[int]: The prefix of the `block_hashes` that are cached.
    """

    def _is_miss(block_hash: PrefixHash) -> bool:
        # A hash counts as a hit only if it is registered AND the block
        # it maps to is marked as computed.
        if block_hash not in self._cached_blocks:
            return True
        return not self.block_is_computed(self._cached_blocks[block_hash])

    # Binary-search for the first miss; everything before it is cached.
    if sys.version_info >= (3, 10):
        first_miss = bisect_left(block_hashes, True, key=_is_miss)
    else:
        # bisect_left only accepts ``key`` from Python 3.10 on, so map the
        # keys up front on older interpreters.
        first_miss = bisect_left([_is_miss(h) for h in block_hashes], True)
    return block_hashes[:first_miss]
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
class PrefixCachingBlock(Block):
|
|
728
|
+
"""A block implementation that supports prefix caching.
|
|
729
|
+
|
|
730
|
+
The PrefixCachingBlock class represents a block of token IDs with prefix
|
|
731
|
+
caching capabilities. It wraps a NaiveBlock internally and provides
|
|
732
|
+
additional functionality for content hashing and promoting immutable blocks
|
|
733
|
+
with the prefix caching allocator.
|
|
734
|
+
|
|
735
|
+
Args:
|
|
736
|
+
prev_block (Optional[PrefixCachingBlock]): The previous block in the
|
|
737
|
+
sequence.
|
|
738
|
+
token_ids (List[int]): The initial token IDs to be stored in the block.
|
|
739
|
+
block_size (int): The maximum number of token IDs that can be stored in
|
|
740
|
+
the block.
|
|
741
|
+
allocator (BlockAllocator): The prefix
|
|
742
|
+
caching block allocator associated with this block.
|
|
743
|
+
block_id (Optional[int], optional): The physical block index
|
|
744
|
+
of this block. Defaults to None.
|
|
745
|
+
extra_hash (Optional[int]): The hash value of additional factors
|
|
746
|
+
such as adapters that influence the block, apart from the token_ids.
|
|
747
|
+
"""
|
|
748
|
+
|
|
749
|
+
# Note that we use 'None' as a string here instead of None because
|
|
750
|
+
# as of Python 3.12, hash(None) returns a constant predictable value.
|
|
751
|
+
# This could possibly make it easier to find and exploit hash
|
|
752
|
+
# collisions. 'None' as a string will be hashed differently per process,
|
|
753
|
+
# but consistently within the same process. This is the same as the
|
|
754
|
+
# behavior of None prior to Python 3.12.
|
|
755
|
+
_none_hash: int = hash('None')
|
|
756
|
+
|
|
757
|
+
def __init__(
    self,
    prev_block: Optional[Block],
    token_ids: List[int],
    block_size: int,
    allocator: BlockAllocator,
    block_id: Optional[int] = None,
    computed: bool = False,
    extra_hash: Optional[int] = None,
):
    """Initialise (or re-initialise) the block.

    The wrapped NaiveBlock is created on the first call; on later calls
    against the same object it is re-initialised in place rather than
    replaced.
    """
    assert isinstance(allocator, PrefixCachingBlockAllocator), (
        "Currently this class is only tested with "
        "PrefixCachingBlockAllocator. Got instead allocator = {}".format(
            allocator))
    assert_prefix_caching_block_or_none(prev_block)

    self._prev_block = prev_block
    self._cached_content_hash: Optional[int] = None
    self._cached_num_tokens_total: int = 0
    self._allocator = allocator
    self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
    self._computed = computed
    self._extra_hash = extra_hash

    naive_kwargs = dict(prev_block=prev_block,
                        token_ids=token_ids,
                        block_size=block_size,
                        block_id=block_id,
                        allocator=self._allocator)
    if hasattr(self, "_block"):
        # Re-initialisation: keep the existing inner block object.
        self._block.__init__(**naive_kwargs)  # type: ignore[has-type]
    else:
        self._block = NaiveBlock(**naive_kwargs)

    self._update_num_tokens_total()
|
|
798
|
+
|
|
799
|
+
def _update_num_tokens_total(self):
    """Recompute the cached number of tokens up to and including this
    block: the previous block's total (if any) plus this block's tokens.
    """
    if self._prev_block is None:
        preceding = 0
    else:
        preceding = self._prev_block.num_tokens_total

    self._cached_num_tokens_total = preceding + len(self.token_ids)
|
|
813
|
+
|
|
814
|
+
@property
def computed(self) -> bool:
    """The block's computed flag."""
    flag = self._computed
    return flag

@computed.setter
def computed(self, value) -> None:
    # Plain store; no validation is performed on ``value``.
    self._computed = value
|
|
821
|
+
|
|
822
|
+
@property
def last_accessed(self) -> float:
    """The last-accessed timestamp stored on this block."""
    stamp = self._last_accessed
    return stamp

@last_accessed.setter
def last_accessed(self, last_accessed_ts: float):
    # Plain store of the new timestamp.
    self._last_accessed = last_accessed_ts
|
|
829
|
+
|
|
830
|
+
def append_token_ids(self, token_ids: List[int]) -> None:
    """Append token IDs to the block, registering the block as immutable
    with the allocator once a content hash becomes available.

    Args:
        token_ids (List[int]): The token IDs to be appended to the block.
            An empty list is a no-op.
    """
    # Only a mutable (not yet promoted, not computed) block may grow.
    assert self.content_hash is None
    assert not self.computed

    if len(token_ids) == 0:
        return

    # Ensure there are input tokens
    assert token_ids, "Got token_ids = {}".format(token_ids)

    # Naive block handles CoW.
    self._block.append_token_ids(token_ids)
    self._update_num_tokens_total()

    if self.content_hash is None:
        return

    # A content hash now exists, so the block can be made immutable.
    # Promotion may replace the physical block index.
    self.block_id = self._allocator.promote_to_immutable_block(self)
|
|
856
|
+
|
|
857
|
+
@property
def block_id(self) -> Optional[int]:
    """The block id stored on the wrapped inner block."""
    inner = self._block
    return inner.block_id

@block_id.setter
def block_id(self, value) -> None:
    # The id lives on the wrapped inner block, not on this object.
    inner = self._block
    inner.block_id = value
|
|
864
|
+
|
|
865
|
+
@property
def is_full(self) -> bool:
    """Whether the wrapped inner block reports itself as full."""
    inner = self._block
    return inner.is_full
|
|
868
|
+
|
|
869
|
+
@property
def num_empty_slots(self) -> int:
    """The number of empty slots reported by the wrapped inner block."""
    inner = self._block
    return inner.num_empty_slots
|
|
872
|
+
|
|
873
|
+
@property
def num_tokens_total(self) -> int:
    """The cached token count up to and including this block, as last
    stored by ``_update_num_tokens_total``.
    """
    total = self._cached_num_tokens_total
    return total
|
|
876
|
+
|
|
877
|
+
@property
def block_size(self) -> int:
    """The block size reported by the wrapped inner block."""
    inner = self._block
    return inner.block_size
|
|
880
|
+
|
|
881
|
+
@property
def token_ids(self) -> List[int]:
    """The token IDs held by the wrapped inner block."""
    inner = self._block
    return inner.token_ids
|
|
884
|
+
|
|
885
|
+
@property
def prev_block(self) -> Optional[Block]:
    """The previous block in the sequence, or None if there is none."""
    predecessor = self._prev_block
    return predecessor
|
|
888
|
+
|
|
889
|
+
@property
def extra_hash(self) -> Optional[int]:
    """The extra hash value this block was initialised with."""
    value = self._extra_hash
    return value
|
|
892
|
+
|
|
893
|
+
@property
def content_hash(self) -> Optional[int]:
    """Return the content-based hash of the current block, or None if it is
    not yet defined.

    For the content-based hash to be defined, the current block must be
    full and, unless it is the first block, its previous block must
    already have a content hash. The hash is computed once and cached.
    """
    # If the hash is already computed, return it.
    if self._cached_content_hash is not None:
        return self._cached_content_hash

    # We cannot compute a hash for the current block because it is not full.
    if not self.is_full:
        return None

    is_first_block = self._prev_block is None
    if is_first_block:
        prev_block_hash = self._none_hash
    else:
        prev_block_hash = self._prev_block.content_hash  # type: ignore
        # Previous block exists but does not yet have a hash: it reports
        # None (not the ``_none_hash`` sentinel), so test for None. Hashing
        # with a None parent would cache a hash that ignores the prefix.
        if prev_block_hash is None:
            return None

    self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
        is_first_block,
        prev_block_hash,
        cur_block_token_ids=self.token_ids,
        extra_hash=self._extra_hash)
    return self._cached_content_hash
|
|
926
|
+
|
|
927
|
+
@classmethod
def hash_block_tokens(cls,
                      is_first_block: bool,
                      prev_block_hash: Optional[int],
                      cur_block_token_ids: List[int],
                      extra_hash: Optional[int] = None) -> int:
    """Hash a block's tokens together with the hash chained from the
    block(s) before it. The result is used for prefix caching.

    Parameters:
    - is_first_block (bool): Whether the block is the first one in the
        sequence.
    - prev_block_hash (Optional[int]): Hash of the preceding block. When
        the block is the first one and this is None, the class-level
        ``_none_hash`` sentinel is used in its place.
    - cur_block_token_ids (List[int]): Token ids of the current block,
        which is assumed to be full.
    - extra_hash (Optional[int]): Hash of any additional factors (such as
        adapters) that influence the block besides its token ids.

    Returns:
    - int: The computed hash value for the block.
    """
    needs_sentinel = is_first_block and prev_block_hash is None
    parent_hash = cls._none_hash if needs_sentinel else prev_block_hash

    # Flag, parent hash, every token id, then the extra hash -- all
    # flattened into one tuple and hashed together.
    hash_key = (is_first_block, parent_hash, *cur_block_token_ids,
                extra_hash)
    return hash(hash_key)
|
|
954
|
+
|
|
955
|
+
|
|
956
|
+
class ComputedBlocksTracker:
    """
    Per-sequence bookkeeping of computed (full) blocks.

    For every tracked sequence id the tracker stores the list of content
    hashes of the sequence's full blocks. The hashes are produced with
    `PrefixCachingBlock.hash_block_tokens`, i.e. the same routine the
    allocator side uses, and the list is extended incrementally as the
    sequence grows during decoding.

    With prefix caching enabled, those hashes are handed to the allocator to
    find out how many leading blocks (and therefore tokens) of the sequence
    are already cached.
    """

    # Note that we use 'None' as a string here instead of None because
    # as of Python 3.12, hash(None) returns a constant predictable value.
    # This could possibly make it easier to find and exploit hash
    # collisions. 'None' as a string will be hashed differently per process,
    # but consistently within the same process. This is the same as the
    # behavior of None prior to Python 3.12.
    _none_hash: int = hash('None')

    def __init__(
        self,
        allocator: DeviceAwareBlockAllocator,
        block_size: int,
        enable_caching: bool,
    ):
        self._allocator = allocator
        self._block_size = block_size
        self._enable_caching = enable_caching

        # seq_id -> hashes of the sequence's full blocks, kept so the hashes
        # need not be recomputed each time the cache is consulted. A block
        # that is not full never gets a hash recorded here.
        self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {}

        # seq_id -> number of cached tokens last reported for the sequence.
        # Kept so that a sequence in continuous prefill does not see its
        # cached token count change; see `get_num_cached_tokens`.
        self._seq_id_to_num_tokens_computed: Dict[int, int] = {}

    def _update_seq_hashes(self, seq: Sequence) -> None:
        """Extend the recorded block hashes of `seq` with any new full
        blocks and store the result."""
        assert self._enable_caching

        recorded = self._seq_id_to_blocks_hashes.get(seq.seq_id, [])
        cur_num_blocks_recorded = len(recorded)
        token_ids = seq.get_token_ids()
        assert len(token_ids) >= cur_num_blocks_recorded * self._block_size, (
            f"The sequence has {len(token_ids)} tokens, but"
            f" already recorded {cur_num_blocks_recorded} blocks. "
            "This should not happen since we assume blocks are "
            "only appended other than recomputation. When the sequence is "
            "recomputed, we should have removed the info of the old blocks.")

        # Only full blocks count as "computed", hence the floor division.
        total_full_blocks = len(token_ids) // self._block_size

        # Each block hash chains from the hash of the block before it, so
        # that blocks are uniquely identified across sequences sharing a
        # prefix. Start from the sentinel when nothing is recorded yet.
        if cur_num_blocks_recorded == 0:
            prev_block_hash = self._none_hash
        else:
            prev_block_hash = recorded[-1]

        # Hash only the blocks that have not been recorded so far.
        for block_idx in range(cur_num_blocks_recorded, total_full_blocks):
            start = block_idx * self._block_size
            end = (block_idx + 1) * self._block_size
            assert len(token_ids) >= end
            block_token_ids = token_ids[start:end]

            # NOTE: Any factor affecting the block other than its token ids
            # has to be folded into extra_hash.
            extra_hash = seq.extra_hash()

            # This must stay in sync with the allocator's hash calculation.
            new_hash = PrefixCachingBlock.hash_block_tokens(
                is_first_block=prev_block_hash == self._none_hash,
                prev_block_hash=prev_block_hash,
                cur_block_token_ids=block_token_ids,
                extra_hash=extra_hash,
            )
            recorded.append(new_hash)
            prev_block_hash = new_hash

        self._seq_id_to_blocks_hashes[seq.seq_id] = recorded

    def get_num_cached_tokens(self, seq: Sequence) -> int:
        """Return the number of tokens of `seq` found in the prefix cache
        (always 0 when caching is disabled)."""
        if not self._enable_caching:
            return 0

        # Refresh the hashes on every call so that blocks completed during
        # decode are not missed; only new blocks get hashed.
        self._update_seq_hashes(seq)

        previously_reported = self._seq_id_to_num_tokens_computed.get(
            seq.seq_id, None)

        # TODO(rickyx): This hack could be removed once we mark blocks as
        # computed correctly with chunked prefills.
        if previously_reported is not None and seq.is_prefill():
            # A sequence that is still prefilling keeps the count it was
            # given before. This also covers chunked prefill: blocks are
            # currently marked as computed even while the sequence is only
            # partially prefilled, so the count must not change mid-run.
            return previously_reported

        seq_hashes = self._seq_id_to_blocks_hashes[seq.seq_id]

        # Original note: this is O(logN), where N is the number of blocks.
        cached_prefix = self._allocator.find_cached_blocks_prefix(seq_hashes)
        cached_token_count = len(cached_prefix) * self._block_size
        self._seq_id_to_num_tokens_computed[seq.seq_id] = cached_token_count
        return cached_token_count

    def remove_seq(self, seq_id: int) -> None:
        """Stop tracking the sequence."""
        if not self._enable_caching:
            return

        assert seq_id in self._seq_id_to_blocks_hashes
        del self._seq_id_to_blocks_hashes[seq_id]

        assert seq_id in self._seq_id_to_num_tokens_computed
        del self._seq_id_to_num_tokens_computed[seq_id]
|
|
1091
|
+
|
|
1092
|
+
|
|
1093
|
+
class LastAccessBlocksTracker:
    """Remembers the last access time of each tracked sequence so that the
    allocator's per-block last access times can be updated efficiently.
    """

    def __init__(self, allocator):
        self._allocator = allocator
        # seq_id -> last recorded access time, or None if none recorded yet.
        self._seq_last_access: Dict[int, Optional[float]] = {}

    def add_seq(self, seq_id: int) -> None:
        """Begin tracking seq_id, with no access time recorded yet."""
        assert seq_id not in self._seq_last_access
        self._seq_last_access[seq_id] = None

    def remove_seq(self, seq_id: int) -> None:
        """Drop seq_id from the tracked sequences."""
        assert seq_id in self._seq_last_access
        self._seq_last_access.pop(seq_id)

    def update_last_access(self, seq_id: int, time: float) -> None:
        """Record `time` as the last access time of a tracked seq_id."""
        assert seq_id in self._seq_last_access
        self._seq_last_access[seq_id] = time

    def update_seq_blocks_last_access(self, seq_id: int,
                                      block_ids: List[int]) -> None:
        """Forward the recorded access time of seq_id to the allocator for
        the given block ids; a sequence without a recorded time is a no-op.
        """
        assert seq_id in self._seq_last_access

        recorded_time = self._seq_last_access[seq_id]

        # Only touch the allocator when an access was actually recorded.
        if recorded_time is not None:
            self._allocator.mark_blocks_as_accessed(block_ids, recorded_time)
|
|
1129
|
+
|
|
1130
|
+
|
|
1131
|
+
def assert_prefix_caching_block_or_none(block: Optional[Block]):
    """Assert that `block` is a PrefixCachingBlock; None is accepted as-is."""
    if block is not None:
        is_prefix_caching = isinstance(block, PrefixCachingBlock)
        assert is_prefix_caching, "Got block = {}".format(block)
|