vllm-npu 0.4.2__tar.gz
- vllm_npu-0.4.2/CMakeLists.txt +294 -0
- vllm_npu-0.4.2/LICENSE +201 -0
- vllm_npu-0.4.2/MANIFEST.in +10 -0
- vllm_npu-0.4.2/PKG-INFO +173 -0
- vllm_npu-0.4.2/README.md +118 -0
- vllm_npu-0.4.2/cmake/cpu_extension.cmake +90 -0
- vllm_npu-0.4.2/cmake/hipify.py +73 -0
- vllm_npu-0.4.2/cmake/utils.cmake +354 -0
- vllm_npu-0.4.2/csrc/activation_kernels.cu +161 -0
- vllm_npu-0.4.2/csrc/attention/attention_dtypes.h +7 -0
- vllm_npu-0.4.2/csrc/attention/attention_generic.cuh +64 -0
- vllm_npu-0.4.2/csrc/attention/attention_kernels.cu +980 -0
- vllm_npu-0.4.2/csrc/attention/attention_utils.cuh +56 -0
- vllm_npu-0.4.2/csrc/attention/dtype_bfloat16.cuh +451 -0
- vllm_npu-0.4.2/csrc/attention/dtype_float16.cuh +502 -0
- vllm_npu-0.4.2/csrc/attention/dtype_float32.cuh +273 -0
- vllm_npu-0.4.2/csrc/attention/dtype_fp8.cuh +35 -0
- vllm_npu-0.4.2/csrc/cache.h +38 -0
- vllm_npu-0.4.2/csrc/cache_kernels.cu +419 -0
- vllm_npu-0.4.2/csrc/cpu/activation.cpp +148 -0
- vllm_npu-0.4.2/csrc/cpu/attention.cpp +746 -0
- vllm_npu-0.4.2/csrc/cpu/cache.cpp +141 -0
- vllm_npu-0.4.2/csrc/cpu/cpu_types.hpp +352 -0
- vllm_npu-0.4.2/csrc/cpu/layernorm.cpp +117 -0
- vllm_npu-0.4.2/csrc/cpu/pos_encoding.cpp +199 -0
- vllm_npu-0.4.2/csrc/cpu/pybind.cpp +73 -0
- vllm_npu-0.4.2/csrc/cuda_compat.h +38 -0
- vllm_npu-0.4.2/csrc/cuda_utils.h +10 -0
- vllm_npu-0.4.2/csrc/cuda_utils_kernels.cu +35 -0
- vllm_npu-0.4.2/csrc/custom_all_reduce.cu +148 -0
- vllm_npu-0.4.2/csrc/custom_all_reduce.cuh +485 -0
- vllm_npu-0.4.2/csrc/custom_all_reduce_test.cu +316 -0
- vllm_npu-0.4.2/csrc/dispatch_utils.h +37 -0
- vllm_npu-0.4.2/csrc/layernorm_kernels.cu +352 -0
- vllm_npu-0.4.2/csrc/moe/moe_ops.cpp +7 -0
- vllm_npu-0.4.2/csrc/moe/moe_ops.h +9 -0
- vllm_npu-0.4.2/csrc/moe/topk_softmax_kernels.cu +499 -0
- vllm_npu-0.4.2/csrc/moe_align_block_size_kernels.cu +125 -0
- vllm_npu-0.4.2/csrc/ops.h +206 -0
- vllm_npu-0.4.2/csrc/pos_encoding_kernels.cu +226 -0
- vllm_npu-0.4.2/csrc/punica/LICENSE +217 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_config.h +162 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_impl.cuh +297 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/generator.py +48 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/vec_dtypes.cuh +1324 -0
- vllm_npu-0.4.2/csrc/punica/punica_ops.cc +582 -0
- vllm_npu-0.4.2/csrc/pybind.cpp +136 -0
- vllm_npu-0.4.2/csrc/quantization/aqlm/gemm_kernels.cu +712 -0
- vllm_npu-0.4.2/csrc/quantization/awq/dequantize.cuh +87 -0
- vllm_npu-0.4.2/csrc/quantization/awq/gemm_kernels.cu +446 -0
- vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/hip_float8.h +167 -0
- vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/hip_float8_impl.h +316 -0
- vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/quant_utils.cuh +517 -0
- vllm_npu-0.4.2/csrc/quantization/fp8/fp8_cuda_kernels.cu +126 -0
- vllm_npu-0.4.2/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh +277 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/compat.cuh +64 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/matrix_view.cuh +274 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/q_gemm.cu +2075 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_2.cuh +87 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_3.cuh +141 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_4.cuh +147 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_8.cuh +40 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_util.cuh +60 -0
- vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin.cu +1722 -0
- vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin.cuh +70 -0
- vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu +352 -0
- vllm_npu-0.4.2/csrc/quantization/marlin/LICENSE +209 -0
- vllm_npu-0.4.2/csrc/quantization/marlin/marlin_cuda_kernel.cu +1138 -0
- vllm_npu-0.4.2/csrc/quantization/squeezellm/quant_cuda_kernel.cu +225 -0
- vllm_npu-0.4.2/csrc/reduction_utils.cuh +65 -0
- vllm_npu-0.4.2/pyproject.toml +67 -0
- vllm_npu-0.4.2/requirements-common.txt +20 -0
- vllm_npu-0.4.2/requirements-cpu.txt +6 -0
- vllm_npu-0.4.2/requirements-cuda.txt +9 -0
- vllm_npu-0.4.2/requirements-neuron.txt +7 -0
- vllm_npu-0.4.2/requirements-rocm.txt +5 -0
- vllm_npu-0.4.2/setup.cfg +4 -0
- vllm_npu-0.4.2/setup.py +299 -0
- vllm_npu-0.4.2/tests/test_cache_block_hashing.py +93 -0
- vllm_npu-0.4.2/tests/test_config.py +39 -0
- vllm_npu-0.4.2/tests/test_logger.py +214 -0
- vllm_npu-0.4.2/tests/test_logits_processor.py +103 -0
- vllm_npu-0.4.2/tests/test_regression.py +58 -0
- vllm_npu-0.4.2/tests/test_sampling_params.py +13 -0
- vllm_npu-0.4.2/tests/test_sequence.py +124 -0
- vllm_npu-0.4.2/vllm/__init__.py +23 -0
- vllm_npu-0.4.2/vllm/_custom_ops.py +251 -0
- vllm_npu-0.4.2/vllm/attention/__init__.py +13 -0
- vllm_npu-0.4.2/vllm/attention/backends/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/attention/backends/abstract.py +127 -0
- vllm_npu-0.4.2/vllm/attention/backends/flash_attn.py +271 -0
- vllm_npu-0.4.2/vllm/attention/backends/flashinfer.py +220 -0
- vllm_npu-0.4.2/vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm_npu-0.4.2/vllm/attention/backends/torch_sdpa.py +250 -0
- vllm_npu-0.4.2/vllm/attention/backends/xformers.py +393 -0
- vllm_npu-0.4.2/vllm/attention/layer.py +56 -0
- vllm_npu-0.4.2/vllm/attention/ops/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/attention/ops/paged_attn.py +216 -0
- vllm_npu-0.4.2/vllm/attention/ops/prefix_prefill.py +792 -0
- vllm_npu-0.4.2/vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm_npu-0.4.2/vllm/attention/selector.py +91 -0
- vllm_npu-0.4.2/vllm/block.py +84 -0
- vllm_npu-0.4.2/vllm/config.py +1225 -0
- vllm_npu-0.4.2/vllm/core/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/core/block/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/core/block/block_table.py +295 -0
- vllm_npu-0.4.2/vllm/core/block/common.py +199 -0
- vllm_npu-0.4.2/vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm_npu-0.4.2/vllm/core/block/interfaces.py +205 -0
- vllm_npu-0.4.2/vllm/core/block/naive_block.py +318 -0
- vllm_npu-0.4.2/vllm/core/block/prefix_caching_block.py +606 -0
- vllm_npu-0.4.2/vllm/core/block_manager_v1.py +625 -0
- vllm_npu-0.4.2/vllm/core/block_manager_v2.py +258 -0
- vllm_npu-0.4.2/vllm/core/evictor_v1.py +105 -0
- vllm_npu-0.4.2/vllm/core/evictor_v2.py +127 -0
- vllm_npu-0.4.2/vllm/core/interfaces.py +113 -0
- vllm_npu-0.4.2/vllm/core/policy.py +45 -0
- vllm_npu-0.4.2/vllm/core/scheduler.py +1163 -0
- vllm_npu-0.4.2/vllm/distributed/__init__.py +3 -0
- vllm_npu-0.4.2/vllm/distributed/communication_op.py +237 -0
- vllm_npu-0.4.2/vllm/distributed/device_communicators/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm_npu-0.4.2/vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm_npu-0.4.2/vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm_npu-0.4.2/vllm/distributed/parallel_state.py +339 -0
- vllm_npu-0.4.2/vllm/distributed/utils.py +136 -0
- vllm_npu-0.4.2/vllm/engine/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/engine/arg_utils.py +649 -0
- vllm_npu-0.4.2/vllm/engine/async_llm_engine.py +737 -0
- vllm_npu-0.4.2/vllm/engine/llm_engine.py +784 -0
- vllm_npu-0.4.2/vllm/engine/metrics.py +368 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/interfaces.py +76 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/multi_step.py +142 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/single_step.py +284 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/stop_checker.py +101 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/util.py +19 -0
- vllm_npu-0.4.2/vllm/entrypoints/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/entrypoints/api_server.py +119 -0
- vllm_npu-0.4.2/vllm/entrypoints/llm.py +259 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/api_server.py +186 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/cli_args.py +115 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/protocol.py +460 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm_npu-0.4.2/vllm/envs.py +217 -0
- vllm_npu-0.4.2/vllm/executor/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/executor/cpu_executor.py +152 -0
- vllm_npu-0.4.2/vllm/executor/distributed_gpu_executor.py +115 -0
- vllm_npu-0.4.2/vllm/executor/executor_base.py +115 -0
- vllm_npu-0.4.2/vllm/executor/gpu_executor.py +150 -0
- vllm_npu-0.4.2/vllm/executor/multiproc_worker_utils.py +263 -0
- vllm_npu-0.4.2/vllm/executor/neuron_executor.py +91 -0
- vllm_npu-0.4.2/vllm/executor/ray_gpu_executor.py +327 -0
- vllm_npu-0.4.2/vllm/executor/ray_utils.py +119 -0
- vllm_npu-0.4.2/vllm/logger.py +153 -0
- vllm_npu-0.4.2/vllm/logging/__init__.py +5 -0
- vllm_npu-0.4.2/vllm/logging/formatter.py +15 -0
- vllm_npu-0.4.2/vllm/lora/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/lora/fully_sharded_layers.py +262 -0
- vllm_npu-0.4.2/vllm/lora/layers.py +1181 -0
- vllm_npu-0.4.2/vllm/lora/lora.py +167 -0
- vllm_npu-0.4.2/vllm/lora/models.py +645 -0
- vllm_npu-0.4.2/vllm/lora/punica.py +213 -0
- vllm_npu-0.4.2/vllm/lora/request.py +32 -0
- vllm_npu-0.4.2/vllm/lora/utils.py +98 -0
- vllm_npu-0.4.2/vllm/lora/worker_manager.py +251 -0
- vllm_npu-0.4.2/vllm/model_executor/__init__.py +7 -0
- vllm_npu-0.4.2/vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm_npu-0.4.2/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm_npu-0.4.2/vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm_npu-0.4.2/vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/activation.py +173 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/layernorm.py +71 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/linear.py +709 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/logits_processor.py +115 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/ops/rand.py +157 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/ops/sample.py +406 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/sampler.py +1051 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/__init__.py +30 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/loader.py +362 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/neuron.py +136 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/utils.py +41 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm_npu-0.4.2/vllm/model_executor/models/__init__.py +119 -0
- vllm_npu-0.4.2/vllm/model_executor/models/baichuan.py +410 -0
- vllm_npu-0.4.2/vllm/model_executor/models/bloom.py +327 -0
- vllm_npu-0.4.2/vllm/model_executor/models/chatglm.py +386 -0
- vllm_npu-0.4.2/vllm/model_executor/models/commandr.py +373 -0
- vllm_npu-0.4.2/vllm/model_executor/models/dbrx.py +413 -0
- vllm_npu-0.4.2/vllm/model_executor/models/decilm.py +122 -0
- vllm_npu-0.4.2/vllm/model_executor/models/deepseek.py +438 -0
- vllm_npu-0.4.2/vllm/model_executor/models/falcon.py +444 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gemma.py +393 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gpt2.py +266 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gpt_j.py +281 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gpt_neox.py +295 -0
- vllm_npu-0.4.2/vllm/model_executor/models/internlm2.py +323 -0
- vllm_npu-0.4.2/vllm/model_executor/models/jais.py +333 -0
- vllm_npu-0.4.2/vllm/model_executor/models/llama.py +442 -0
- vllm_npu-0.4.2/vllm/model_executor/models/llava.py +239 -0
- vllm_npu-0.4.2/vllm/model_executor/models/minicpm.py +531 -0
- vllm_npu-0.4.2/vllm/model_executor/models/mixtral.py +583 -0
- vllm_npu-0.4.2/vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm_npu-0.4.2/vllm/model_executor/models/mpt.py +295 -0
- vllm_npu-0.4.2/vllm/model_executor/models/olmo.py +356 -0
- vllm_npu-0.4.2/vllm/model_executor/models/opt.py +349 -0
- vllm_npu-0.4.2/vllm/model_executor/models/orion.py +319 -0
- vllm_npu-0.4.2/vllm/model_executor/models/phi.py +300 -0
- vllm_npu-0.4.2/vllm/model_executor/models/qwen.py +284 -0
- vllm_npu-0.4.2/vllm/model_executor/models/qwen2.py +367 -0
- vllm_npu-0.4.2/vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm_npu-0.4.2/vllm/model_executor/models/stablelm.py +301 -0
- vllm_npu-0.4.2/vllm/model_executor/models/starcoder2.py +302 -0
- vllm_npu-0.4.2/vllm/model_executor/models/xverse.py +366 -0
- vllm_npu-0.4.2/vllm/model_executor/sampling_metadata.py +588 -0
- vllm_npu-0.4.2/vllm/model_executor/utils.py +35 -0
- vllm_npu-0.4.2/vllm/outputs.py +150 -0
- vllm_npu-0.4.2/vllm/py.typed +2 -0
- vllm_npu-0.4.2/vllm/sampling_params.py +340 -0
- vllm_npu-0.4.2/vllm/sequence.py +766 -0
- vllm_npu-0.4.2/vllm/spec_decode/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/spec_decode/batch_expansion.py +397 -0
- vllm_npu-0.4.2/vllm/spec_decode/interfaces.py +73 -0
- vllm_npu-0.4.2/vllm/spec_decode/metrics.py +191 -0
- vllm_npu-0.4.2/vllm/spec_decode/multi_step_worker.py +203 -0
- vllm_npu-0.4.2/vllm/spec_decode/ngram_worker.py +176 -0
- vllm_npu-0.4.2/vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm_npu-0.4.2/vllm/spec_decode/top1_proposer.py +200 -0
- vllm_npu-0.4.2/vllm/spec_decode/util.py +228 -0
- vllm_npu-0.4.2/vllm/test_utils.py +41 -0
- vllm_npu-0.4.2/vllm/transformers_utils/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/transformers_utils/config.py +58 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/__init__.py +16 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/falcon.py +87 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/jais.py +236 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/mpt.py +178 -0
- vllm_npu-0.4.2/vllm/transformers_utils/detokenizer.py +313 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer.py +149 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm_npu-0.4.2/vllm/usage/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/usage/usage_lib.py +209 -0
- vllm_npu-0.4.2/vllm/utils.py +677 -0
- vllm_npu-0.4.2/vllm/worker/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/worker/cache_engine.py +105 -0
- vllm_npu-0.4.2/vllm/worker/cpu_model_runner.py +346 -0
- vllm_npu-0.4.2/vllm/worker/cpu_worker.py +321 -0
- vllm_npu-0.4.2/vllm/worker/model_runner.py +1168 -0
- vllm_npu-0.4.2/vllm/worker/neuron_model_runner.py +196 -0
- vllm_npu-0.4.2/vllm/worker/neuron_worker.py +98 -0
- vllm_npu-0.4.2/vllm/worker/worker.py +345 -0
- vllm_npu-0.4.2/vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/PKG-INFO +173 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/SOURCES.txt +308 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/dependency_links.txt +1 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/requires.txt +26 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/top_level.txt +1 -0
vllm_npu-0.4.2/CMakeLists.txt
ADDED
@@ -0,0 +1,294 @@
cmake_minimum_required(VERSION 3.21)

project(vllm_extensions LANGUAGES CXX)

option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")

# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")

#
# Supported/expected torch versions for CUDA/ROCm.
#
# Currently, having an incorrect pytorch version results in a warning
# rather than an error.
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm.
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")

#
# Try to find python package with an executable that exactly matches
# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
#
if (VLLM_PYTHON_EXECUTABLE)
  find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
else()
  message(FATAL_ERROR
    "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
    " before running cmake configure.")
endif()

#
# Update cmake's `CMAKE_PREFIX_PATH` with torch location.
#
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

# Ensure the 'nvcc' command is in the PATH
find_program(NVCC_EXECUTABLE nvcc)
if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
  message(FATAL_ERROR "nvcc not found")
endif()

#
# Import torch cmake configuration.
# Torch also imports CUDA (and partially HIP) languages with some customizations,
# so there is no need to do this explicitly with check_language/enable_language,
# etc.
#
find_package(Torch REQUIRED)

#
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
# `libtorch_python.so` for linking against an extension. Torch's cmake
# configuration does not include this library (presumably since the cmake
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by vLLM extensions for this
# reason. So, add it manually with `find_library` using torch's
# installed library path.
#
find_library(torch_python_LIBRARY torch_python PATHS
  "${TORCH_INSTALL_PREFIX}/lib")

#
# Forward the non-CUDA device extensions to external CMake scripts.
#
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
    NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
  if (VLLM_TARGET_DEVICE STREQUAL "cpu")
    include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
  else()
    message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
  endif()
  return()
endif()

#
# Set up GPU language and check the torch version and warn if it isn't
# what is expected.
#
if (NOT HIP_FOUND AND CUDA_FOUND)
  set(VLLM_GPU_LANG "CUDA")

  if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
      "expected for CUDA build, saw ${Torch_VERSION} instead.")
  endif()
elseif(HIP_FOUND)
  set(VLLM_GPU_LANG "HIP")

  # Importing torch recognizes and sets up some HIP/ROCm configuration but does
  # not let cmake recognize .hip files. In order to get cmake to understand the
  # .hip extension automatically, HIP must be enabled explicitly.
  enable_language(HIP)

  # ROCm 5.x
  if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
      "expected for ROCm 5.x build, saw ${Torch_VERSION} instead.")
  endif()

  # ROCm 6.x
  if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
      "expected for ROCm 6.x build, saw ${Torch_VERSION} instead.")
  endif()
else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
  ${VLLM_GPU_LANG}
  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")

#
# Query torch for additional GPU compilation flags for the given
# `VLLM_GPU_LANG`.
# The final set of flags is stored in `VLLM_GPU_FLAGS`.
#
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})

#
# Set nvcc parallelism.
#
if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()

#
# Define extension targets
#

#
# _C extension
#
set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/attention_kernels.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/fp8/fp8_cuda_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/moe_align_block_size_kernels.cu"
  "csrc/pybind.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/quantization/marlin/marlin_cuda_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
    "csrc/custom_all_reduce.cu")
endif()

define_gpu_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  WITH_SOABI)

#
# _moe_C extension
#
set(VLLM_MOE_EXT_SRC
  "csrc/moe/moe_ops.cpp"
  "csrc/moe/topk_softmax_kernels.cu")

define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  WITH_SOABI)

#
# _punica_C extension
#
set(VLLM_PUNICA_EXT_SRC
  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
  "csrc/punica/punica_ops.cc")

#
# Copy GPU compilation flags + update for punica.
#
set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
  "-D__CUDA_NO_HALF_OPERATORS__"
  "-D__CUDA_NO_HALF_CONVERSIONS__"
  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
  "-D__CUDA_NO_HALF2_OPERATORS__")

#
# Filter out CUDA architectures < 8.0 for punica.
#
if (${VLLM_GPU_LANG} STREQUAL "CUDA")
  set(VLLM_PUNICA_GPU_ARCHES)
  foreach(ARCH ${VLLM_GPU_ARCHES})
    string_to_ver(CODE_VER ${ARCH})
    if (CODE_VER GREATER_EQUAL 8.0)
      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
    endif()
  endforeach()
  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()

if (VLLM_PUNICA_GPU_ARCHES)
  define_gpu_extension_target(
    _punica_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_PUNICA_EXT_SRC}
    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
    WITH_SOABI)
else()
  message(WARNING "Unable to create _punica_C target because none of the "
    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
endif()

#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)

if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling C extension.")
  add_dependencies(default _C)
endif()

if(VLLM_GPU_LANG STREQUAL "CUDA")
  message(STATUS "Enabling moe extension.")
  add_dependencies(default _moe_C)

  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
  # there are supported target arches.
  if (VLLM_PUNICA_GPU_ARCHES AND
      (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
    message(STATUS "Enabling punica extension.")
    add_dependencies(default _punica_C)
  endif()
endif()
vllm_npu-0.4.2/LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
vllm_npu-0.4.2/MANIFEST.in
ADDED
@@ -0,0 +1,10 @@
include LICENSE
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include CMakeLists.txt

recursive-include cmake *
recursive-include csrc *
vllm_npu-0.4.2/PKG-INFO
ADDED
@@ -0,0 +1,173 @@
Metadata-Version: 2.2
Name: vllm_npu
Version: 0.4.2
Summary: A high-throughput and memory-efficient inference and serving engine for LLMs
Home-page: https://github.com/vllm-project/vllm
Author: vLLM Team
License: Apache 2.0
Project-URL: Homepage, https://github.com/vllm-project/vllm
Project-URL: Documentation, https://vllm.readthedocs.io/en/latest/
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: cmake>=3.21
Requires-Dist: ninja
Requires-Dist: psutil
Requires-Dist: sentencepiece
Requires-Dist: numpy
Requires-Dist: requests
Requires-Dist: py-cpuinfo
Requires-Dist: transformers>=4.40.0
Requires-Dist: tokenizers>=0.19.1
Requires-Dist: fastapi
Requires-Dist: openai
Requires-Dist: uvicorn[standard]
Requires-Dist: pydantic>=2.0
Requires-Dist: prometheus_client>=0.18.0
Requires-Dist: prometheus-fastapi-instrumentator>=7.0.0
Requires-Dist: tiktoken==0.6.0
Requires-Dist: lm-format-enforcer==0.10.1
Requires-Dist: typing_extensions
Requires-Dist: filelock>=3.10.4
Requires-Dist: ray==2.9.3
Requires-Dist: pynvml==11.5.0
Requires-Dist: outlines==0.0.34
Requires-Dist: npu-vllm==0.4.2
Provides-Extra: tensorizer
Requires-Dist: tensorizer==2.9.0; extra == "tensorizer"
Dynamic: author
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: project-url
Dynamic: provides-extra
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

<p align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
  </picture>
</p>

<h3 align="center">
Easy, fast, and cheap LLM serving for everyone
</h3>

<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
</p>

*Latest News* 🔥
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
- [2023/12] Added ROCm 5.7 support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2 models on vLLM with a single command!
- [2023/06] Serving vLLM on any cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.

vLLM is fast with:

- State-of-the-art serving throughput
- Efficient management of attention key and value memory with **PagedAttention**
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV cache
- Optimized CUDA kernels

vLLM is flexible and easy to use with:

- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support for NVIDIA GPUs and AMD GPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-LoRA support

vLLM seamlessly supports many Hugging Face models, including the following architectures:

- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
- StableLM (`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2 (`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

```bash
pip install vllm
```
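
Once installed, the offline-inference entry point is the `LLM` class exported by the `vllm` package whose sources are listed above. The following is a minimal sketch following the upstream vLLM quickstart; the model name is only an illustrative choice:

```python
# Minimal offline-inference sketch following the upstream vLLM quickstart.
# The model name is only an example; any supported architecture listed above works.
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")  # downloads weights from Hugging Face
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(f"{output.prompt!r} -> {output.outputs[0].text!r}")
```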

## Getting Started

Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)

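The package also bundles the OpenAI-compatible server (`vllm/entrypoints/openai/` in the file list above). As a hedged sketch, assuming the server was started locally with `python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m` and an `openai>=1.0` client is available, a completion request looks like:

```python
# Sketch: query a locally running vLLM OpenAI-compatible server.
# Assumes the server is on the default port 8000; vLLM does not require a
# real API key unless one was configured, so a placeholder suffices.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="San Francisco is a",
    max_tokens=32,
)
print(completion.choices[0].text)
```
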
## Contributing

We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

## Citation

If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
```bibtex
@inproceedings{kwon2023efficient,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
  booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
  year={2023}
}
```