vllm-npu 0.4.2__tar.gz
- vllm_npu-0.4.2/CMakeLists.txt +294 -0
- vllm_npu-0.4.2/LICENSE +201 -0
- vllm_npu-0.4.2/MANIFEST.in +10 -0
- vllm_npu-0.4.2/PKG-INFO +173 -0
- vllm_npu-0.4.2/README.md +118 -0
- vllm_npu-0.4.2/cmake/cpu_extension.cmake +90 -0
- vllm_npu-0.4.2/cmake/hipify.py +73 -0
- vllm_npu-0.4.2/cmake/utils.cmake +354 -0
- vllm_npu-0.4.2/csrc/activation_kernels.cu +161 -0
- vllm_npu-0.4.2/csrc/attention/attention_dtypes.h +7 -0
- vllm_npu-0.4.2/csrc/attention/attention_generic.cuh +64 -0
- vllm_npu-0.4.2/csrc/attention/attention_kernels.cu +980 -0
- vllm_npu-0.4.2/csrc/attention/attention_utils.cuh +56 -0
- vllm_npu-0.4.2/csrc/attention/dtype_bfloat16.cuh +451 -0
- vllm_npu-0.4.2/csrc/attention/dtype_float16.cuh +502 -0
- vllm_npu-0.4.2/csrc/attention/dtype_float32.cuh +273 -0
- vllm_npu-0.4.2/csrc/attention/dtype_fp8.cuh +35 -0
- vllm_npu-0.4.2/csrc/cache.h +38 -0
- vllm_npu-0.4.2/csrc/cache_kernels.cu +419 -0
- vllm_npu-0.4.2/csrc/cpu/activation.cpp +148 -0
- vllm_npu-0.4.2/csrc/cpu/attention.cpp +746 -0
- vllm_npu-0.4.2/csrc/cpu/cache.cpp +141 -0
- vllm_npu-0.4.2/csrc/cpu/cpu_types.hpp +352 -0
- vllm_npu-0.4.2/csrc/cpu/layernorm.cpp +117 -0
- vllm_npu-0.4.2/csrc/cpu/pos_encoding.cpp +199 -0
- vllm_npu-0.4.2/csrc/cpu/pybind.cpp +73 -0
- vllm_npu-0.4.2/csrc/cuda_compat.h +38 -0
- vllm_npu-0.4.2/csrc/cuda_utils.h +10 -0
- vllm_npu-0.4.2/csrc/cuda_utils_kernels.cu +35 -0
- vllm_npu-0.4.2/csrc/custom_all_reduce.cu +148 -0
- vllm_npu-0.4.2/csrc/custom_all_reduce.cuh +485 -0
- vllm_npu-0.4.2/csrc/custom_all_reduce_test.cu +316 -0
- vllm_npu-0.4.2/csrc/dispatch_utils.h +37 -0
- vllm_npu-0.4.2/csrc/layernorm_kernels.cu +352 -0
- vllm_npu-0.4.2/csrc/moe/moe_ops.cpp +7 -0
- vllm_npu-0.4.2/csrc/moe/moe_ops.h +9 -0
- vllm_npu-0.4.2/csrc/moe/topk_softmax_kernels.cu +499 -0
- vllm_npu-0.4.2/csrc/moe_align_block_size_kernels.cu +125 -0
- vllm_npu-0.4.2/csrc/ops.h +206 -0
- vllm_npu-0.4.2/csrc/pos_encoding_kernels.cu +226 -0
- vllm_npu-0.4.2/csrc/punica/LICENSE +217 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_config.h +162 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu +5 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_impl.cuh +297 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/generator.py +48 -0
- vllm_npu-0.4.2/csrc/punica/bgmv/vec_dtypes.cuh +1324 -0
- vllm_npu-0.4.2/csrc/punica/punica_ops.cc +582 -0
- vllm_npu-0.4.2/csrc/pybind.cpp +136 -0
- vllm_npu-0.4.2/csrc/quantization/aqlm/gemm_kernels.cu +712 -0
- vllm_npu-0.4.2/csrc/quantization/awq/dequantize.cuh +87 -0
- vllm_npu-0.4.2/csrc/quantization/awq/gemm_kernels.cu +446 -0
- vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/hip_float8.h +167 -0
- vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/hip_float8_impl.h +316 -0
- vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/quant_utils.cuh +517 -0
- vllm_npu-0.4.2/csrc/quantization/fp8/fp8_cuda_kernels.cu +126 -0
- vllm_npu-0.4.2/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh +277 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/compat.cuh +64 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/matrix_view.cuh +274 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/q_gemm.cu +2075 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_2.cuh +87 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_3.cuh +141 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_4.cuh +147 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_8.cuh +40 -0
- vllm_npu-0.4.2/csrc/quantization/gptq/qdq_util.cuh +60 -0
- vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin.cu +1722 -0
- vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin.cuh +70 -0
- vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu +352 -0
- vllm_npu-0.4.2/csrc/quantization/marlin/LICENSE +209 -0
- vllm_npu-0.4.2/csrc/quantization/marlin/marlin_cuda_kernel.cu +1138 -0
- vllm_npu-0.4.2/csrc/quantization/squeezellm/quant_cuda_kernel.cu +225 -0
- vllm_npu-0.4.2/csrc/reduction_utils.cuh +65 -0
- vllm_npu-0.4.2/pyproject.toml +67 -0
- vllm_npu-0.4.2/requirements-common.txt +20 -0
- vllm_npu-0.4.2/requirements-cpu.txt +6 -0
- vllm_npu-0.4.2/requirements-cuda.txt +9 -0
- vllm_npu-0.4.2/requirements-neuron.txt +7 -0
- vllm_npu-0.4.2/requirements-rocm.txt +5 -0
- vllm_npu-0.4.2/setup.cfg +4 -0
- vllm_npu-0.4.2/setup.py +299 -0
- vllm_npu-0.4.2/tests/test_cache_block_hashing.py +93 -0
- vllm_npu-0.4.2/tests/test_config.py +39 -0
- vllm_npu-0.4.2/tests/test_logger.py +214 -0
- vllm_npu-0.4.2/tests/test_logits_processor.py +103 -0
- vllm_npu-0.4.2/tests/test_regression.py +58 -0
- vllm_npu-0.4.2/tests/test_sampling_params.py +13 -0
- vllm_npu-0.4.2/tests/test_sequence.py +124 -0
- vllm_npu-0.4.2/vllm/__init__.py +23 -0
- vllm_npu-0.4.2/vllm/_custom_ops.py +251 -0
- vllm_npu-0.4.2/vllm/attention/__init__.py +13 -0
- vllm_npu-0.4.2/vllm/attention/backends/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/attention/backends/abstract.py +127 -0
- vllm_npu-0.4.2/vllm/attention/backends/flash_attn.py +271 -0
- vllm_npu-0.4.2/vllm/attention/backends/flashinfer.py +220 -0
- vllm_npu-0.4.2/vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm_npu-0.4.2/vllm/attention/backends/torch_sdpa.py +250 -0
- vllm_npu-0.4.2/vllm/attention/backends/xformers.py +393 -0
- vllm_npu-0.4.2/vllm/attention/layer.py +56 -0
- vllm_npu-0.4.2/vllm/attention/ops/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/attention/ops/paged_attn.py +216 -0
- vllm_npu-0.4.2/vllm/attention/ops/prefix_prefill.py +792 -0
- vllm_npu-0.4.2/vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm_npu-0.4.2/vllm/attention/selector.py +91 -0
- vllm_npu-0.4.2/vllm/block.py +84 -0
- vllm_npu-0.4.2/vllm/config.py +1225 -0
- vllm_npu-0.4.2/vllm/core/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/core/block/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/core/block/block_table.py +295 -0
- vllm_npu-0.4.2/vllm/core/block/common.py +199 -0
- vllm_npu-0.4.2/vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm_npu-0.4.2/vllm/core/block/interfaces.py +205 -0
- vllm_npu-0.4.2/vllm/core/block/naive_block.py +318 -0
- vllm_npu-0.4.2/vllm/core/block/prefix_caching_block.py +606 -0
- vllm_npu-0.4.2/vllm/core/block_manager_v1.py +625 -0
- vllm_npu-0.4.2/vllm/core/block_manager_v2.py +258 -0
- vllm_npu-0.4.2/vllm/core/evictor_v1.py +105 -0
- vllm_npu-0.4.2/vllm/core/evictor_v2.py +127 -0
- vllm_npu-0.4.2/vllm/core/interfaces.py +113 -0
- vllm_npu-0.4.2/vllm/core/policy.py +45 -0
- vllm_npu-0.4.2/vllm/core/scheduler.py +1163 -0
- vllm_npu-0.4.2/vllm/distributed/__init__.py +3 -0
- vllm_npu-0.4.2/vllm/distributed/communication_op.py +237 -0
- vllm_npu-0.4.2/vllm/distributed/device_communicators/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm_npu-0.4.2/vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm_npu-0.4.2/vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm_npu-0.4.2/vllm/distributed/parallel_state.py +339 -0
- vllm_npu-0.4.2/vllm/distributed/utils.py +136 -0
- vllm_npu-0.4.2/vllm/engine/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/engine/arg_utils.py +649 -0
- vllm_npu-0.4.2/vllm/engine/async_llm_engine.py +737 -0
- vllm_npu-0.4.2/vllm/engine/llm_engine.py +784 -0
- vllm_npu-0.4.2/vllm/engine/metrics.py +368 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/interfaces.py +76 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/multi_step.py +142 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/single_step.py +284 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/stop_checker.py +101 -0
- vllm_npu-0.4.2/vllm/engine/output_processor/util.py +19 -0
- vllm_npu-0.4.2/vllm/entrypoints/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/entrypoints/api_server.py +119 -0
- vllm_npu-0.4.2/vllm/entrypoints/llm.py +259 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/api_server.py +186 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/cli_args.py +115 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/protocol.py +460 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm_npu-0.4.2/vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm_npu-0.4.2/vllm/envs.py +217 -0
- vllm_npu-0.4.2/vllm/executor/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/executor/cpu_executor.py +152 -0
- vllm_npu-0.4.2/vllm/executor/distributed_gpu_executor.py +115 -0
- vllm_npu-0.4.2/vllm/executor/executor_base.py +115 -0
- vllm_npu-0.4.2/vllm/executor/gpu_executor.py +150 -0
- vllm_npu-0.4.2/vllm/executor/multiproc_worker_utils.py +263 -0
- vllm_npu-0.4.2/vllm/executor/neuron_executor.py +91 -0
- vllm_npu-0.4.2/vllm/executor/ray_gpu_executor.py +327 -0
- vllm_npu-0.4.2/vllm/executor/ray_utils.py +119 -0
- vllm_npu-0.4.2/vllm/logger.py +153 -0
- vllm_npu-0.4.2/vllm/logging/__init__.py +5 -0
- vllm_npu-0.4.2/vllm/logging/formatter.py +15 -0
- vllm_npu-0.4.2/vllm/lora/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/lora/fully_sharded_layers.py +262 -0
- vllm_npu-0.4.2/vllm/lora/layers.py +1181 -0
- vllm_npu-0.4.2/vllm/lora/lora.py +167 -0
- vllm_npu-0.4.2/vllm/lora/models.py +645 -0
- vllm_npu-0.4.2/vllm/lora/punica.py +213 -0
- vllm_npu-0.4.2/vllm/lora/request.py +32 -0
- vllm_npu-0.4.2/vllm/lora/utils.py +98 -0
- vllm_npu-0.4.2/vllm/lora/worker_manager.py +251 -0
- vllm_npu-0.4.2/vllm/model_executor/__init__.py +7 -0
- vllm_npu-0.4.2/vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm_npu-0.4.2/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm_npu-0.4.2/vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm_npu-0.4.2/vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/activation.py +173 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/layernorm.py +71 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/linear.py +709 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/logits_processor.py +115 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/ops/rand.py +157 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/ops/sample.py +406 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/sampler.py +1051 -0
- vllm_npu-0.4.2/vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/__init__.py +30 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/loader.py +362 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/neuron.py +136 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/utils.py +41 -0
- vllm_npu-0.4.2/vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm_npu-0.4.2/vllm/model_executor/models/__init__.py +119 -0
- vllm_npu-0.4.2/vllm/model_executor/models/baichuan.py +410 -0
- vllm_npu-0.4.2/vllm/model_executor/models/bloom.py +327 -0
- vllm_npu-0.4.2/vllm/model_executor/models/chatglm.py +386 -0
- vllm_npu-0.4.2/vllm/model_executor/models/commandr.py +373 -0
- vllm_npu-0.4.2/vllm/model_executor/models/dbrx.py +413 -0
- vllm_npu-0.4.2/vllm/model_executor/models/decilm.py +122 -0
- vllm_npu-0.4.2/vllm/model_executor/models/deepseek.py +438 -0
- vllm_npu-0.4.2/vllm/model_executor/models/falcon.py +444 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gemma.py +393 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gpt2.py +266 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gpt_j.py +281 -0
- vllm_npu-0.4.2/vllm/model_executor/models/gpt_neox.py +295 -0
- vllm_npu-0.4.2/vllm/model_executor/models/internlm2.py +323 -0
- vllm_npu-0.4.2/vllm/model_executor/models/jais.py +333 -0
- vllm_npu-0.4.2/vllm/model_executor/models/llama.py +442 -0
- vllm_npu-0.4.2/vllm/model_executor/models/llava.py +239 -0
- vllm_npu-0.4.2/vllm/model_executor/models/minicpm.py +531 -0
- vllm_npu-0.4.2/vllm/model_executor/models/mixtral.py +583 -0
- vllm_npu-0.4.2/vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm_npu-0.4.2/vllm/model_executor/models/mpt.py +295 -0
- vllm_npu-0.4.2/vllm/model_executor/models/olmo.py +356 -0
- vllm_npu-0.4.2/vllm/model_executor/models/opt.py +349 -0
- vllm_npu-0.4.2/vllm/model_executor/models/orion.py +319 -0
- vllm_npu-0.4.2/vllm/model_executor/models/phi.py +300 -0
- vllm_npu-0.4.2/vllm/model_executor/models/qwen.py +284 -0
- vllm_npu-0.4.2/vllm/model_executor/models/qwen2.py +367 -0
- vllm_npu-0.4.2/vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm_npu-0.4.2/vllm/model_executor/models/stablelm.py +301 -0
- vllm_npu-0.4.2/vllm/model_executor/models/starcoder2.py +302 -0
- vllm_npu-0.4.2/vllm/model_executor/models/xverse.py +366 -0
- vllm_npu-0.4.2/vllm/model_executor/sampling_metadata.py +588 -0
- vllm_npu-0.4.2/vllm/model_executor/utils.py +35 -0
- vllm_npu-0.4.2/vllm/outputs.py +150 -0
- vllm_npu-0.4.2/vllm/py.typed +2 -0
- vllm_npu-0.4.2/vllm/sampling_params.py +340 -0
- vllm_npu-0.4.2/vllm/sequence.py +766 -0
- vllm_npu-0.4.2/vllm/spec_decode/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/spec_decode/batch_expansion.py +397 -0
- vllm_npu-0.4.2/vllm/spec_decode/interfaces.py +73 -0
- vllm_npu-0.4.2/vllm/spec_decode/metrics.py +191 -0
- vllm_npu-0.4.2/vllm/spec_decode/multi_step_worker.py +203 -0
- vllm_npu-0.4.2/vllm/spec_decode/ngram_worker.py +176 -0
- vllm_npu-0.4.2/vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm_npu-0.4.2/vllm/spec_decode/top1_proposer.py +200 -0
- vllm_npu-0.4.2/vllm/spec_decode/util.py +228 -0
- vllm_npu-0.4.2/vllm/test_utils.py +41 -0
- vllm_npu-0.4.2/vllm/transformers_utils/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/transformers_utils/config.py +58 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/__init__.py +16 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/falcon.py +87 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/jais.py +236 -0
- vllm_npu-0.4.2/vllm/transformers_utils/configs/mpt.py +178 -0
- vllm_npu-0.4.2/vllm/transformers_utils/detokenizer.py +313 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer.py +149 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm_npu-0.4.2/vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm_npu-0.4.2/vllm/usage/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/usage/usage_lib.py +209 -0
- vllm_npu-0.4.2/vllm/utils.py +677 -0
- vllm_npu-0.4.2/vllm/worker/__init__.py +0 -0
- vllm_npu-0.4.2/vllm/worker/cache_engine.py +105 -0
- vllm_npu-0.4.2/vllm/worker/cpu_model_runner.py +346 -0
- vllm_npu-0.4.2/vllm/worker/cpu_worker.py +321 -0
- vllm_npu-0.4.2/vllm/worker/model_runner.py +1168 -0
- vllm_npu-0.4.2/vllm/worker/neuron_model_runner.py +196 -0
- vllm_npu-0.4.2/vllm/worker/neuron_worker.py +98 -0
- vllm_npu-0.4.2/vllm/worker/worker.py +345 -0
- vllm_npu-0.4.2/vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/PKG-INFO +173 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/SOURCES.txt +308 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/dependency_links.txt +1 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/requires.txt +26 -0
- vllm_npu-0.4.2/vllm_npu.egg-info/top_level.txt +1 -0
vllm_npu-0.4.2/CMakeLists.txt
ADDED
@@ -0,0 +1,294 @@
cmake_minimum_required(VERSION 3.21)

project(vllm_extensions LANGUAGES CXX)

option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")

# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")

#
# Supported/expected torch versions for CUDA/ROCm.
#
# Currently, having an incorrect pytorch version results in a warning
# rather than an error.
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm.
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")

#
# Try to find python package with an executable that exactly matches
# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
#
if (VLLM_PYTHON_EXECUTABLE)
  find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
else()
  message(FATAL_ERROR
    "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
    " before running cmake configure.")
endif()

#
# Update cmake's `CMAKE_PREFIX_PATH` with torch location.
#
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

# Ensure the 'nvcc' command is in the PATH
find_program(NVCC_EXECUTABLE nvcc)
if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
  message(FATAL_ERROR "nvcc not found")
endif()

#
# Import torch cmake configuration.
# Torch also imports CUDA (and partially HIP) languages with some customizations,
# so there is no need to do this explicitly with check_language/enable_language,
# etc.
#
find_package(Torch REQUIRED)

#
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
# `libtorch_python.so` for linking against an extension. Torch's cmake
# configuration does not include this library (presumably since the cmake
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by vLLM extensions for this
# reason. So, add it manually with `find_library` using torch's
# installed library path.
#
find_library(torch_python_LIBRARY torch_python PATHS
  "${TORCH_INSTALL_PREFIX}/lib")

#
# Forward the non-CUDA device extensions to external CMake scripts.
#
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
    NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
  if (VLLM_TARGET_DEVICE STREQUAL "cpu")
    include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
  else()
    message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
  endif()
  return()
endif()

#
# Set up GPU language and check the torch version and warn if it isn't
# what is expected.
#
if (NOT HIP_FOUND AND CUDA_FOUND)
  set(VLLM_GPU_LANG "CUDA")

  if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
      "expected for CUDA build, saw ${Torch_VERSION} instead.")
  endif()
elseif(HIP_FOUND)
  set(VLLM_GPU_LANG "HIP")

  # Importing torch recognizes and sets up some HIP/ROCm configuration but does
  # not let cmake recognize .hip files. In order to get cmake to understand the
  # .hip extension automatically, HIP must be enabled explicitly.
  enable_language(HIP)

  # ROCm 5.x
  if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
      "expected for ROCm 5.x build, saw ${Torch_VERSION} instead.")
  endif()

  # ROCm 6.x
  if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
      "expected for ROCm 6.x build, saw ${Torch_VERSION} instead.")
  endif()
else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
  ${VLLM_GPU_LANG}
  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")

#
# Query torch for additional GPU compilation flags for the given
# `VLLM_GPU_LANG`.
# The final set of flags is stored in `VLLM_GPU_FLAGS`.
#
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})

#
# Set nvcc parallelism.
#
if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()

#
# Define extension targets
#

#
# _C extension
#
set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/attention_kernels.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/fp8/fp8_cuda_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/moe_align_block_size_kernels.cu"
  "csrc/pybind.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/quantization/marlin/marlin_cuda_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
    "csrc/custom_all_reduce.cu")
endif()

define_gpu_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  WITH_SOABI)

#
# _moe_C extension
#
set(VLLM_MOE_EXT_SRC
  "csrc/moe/moe_ops.cpp"
  "csrc/moe/topk_softmax_kernels.cu")

define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  WITH_SOABI)

#
# _punica_C extension
#
set(VLLM_PUNICA_EXT_SRC
  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
  "csrc/punica/punica_ops.cc")

#
# Copy GPU compilation flags + update for punica.
#
set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
  "-D__CUDA_NO_HALF_OPERATORS__"
  "-D__CUDA_NO_HALF_CONVERSIONS__"
  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
  "-D__CUDA_NO_HALF2_OPERATORS__")

#
# Filter out CUDA architectures < 8.0 for punica.
#
if (${VLLM_GPU_LANG} STREQUAL "CUDA")
  set(VLLM_PUNICA_GPU_ARCHES)
  foreach(ARCH ${VLLM_GPU_ARCHES})
    string_to_ver(CODE_VER ${ARCH})
    if (CODE_VER GREATER_EQUAL 8.0)
      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
    endif()
  endforeach()
  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()

if (VLLM_PUNICA_GPU_ARCHES)
  define_gpu_extension_target(
    _punica_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_PUNICA_EXT_SRC}
    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
    WITH_SOABI)
else()
  message(WARNING "Unable to create _punica_C target because none of the "
    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
endif()

#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)

if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling C extension.")
  add_dependencies(default _C)
endif()

if(VLLM_GPU_LANG STREQUAL "CUDA")
  message(STATUS "Enabling moe extension.")
  add_dependencies(default _moe_C)

  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
  # there are supported target arches.
  if (VLLM_PUNICA_GPU_ARCHES AND
      (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
    message(STATUS "Enabling punica extension.")
    add_dependencies(default _punica_C)
  endif()
endif()
vllm_npu-0.4.2/LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
vllm_npu-0.4.2/MANIFEST.in
ADDED
@@ -0,0 +1,10 @@
include LICENSE
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include CMakeLists.txt

recursive-include cmake *
recursive-include csrc *
vllm_npu-0.4.2/PKG-INFO
ADDED
@@ -0,0 +1,173 @@
Metadata-Version: 2.2
Name: vllm_npu
Version: 0.4.2
Summary: A high-throughput and memory-efficient inference and serving engine for LLMs
Home-page: https://github.com/vllm-project/vllm
Author: vLLM Team
License: Apache 2.0
Project-URL: Homepage, https://github.com/vllm-project/vllm
Project-URL: Documentation, https://vllm.readthedocs.io/en/latest/
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: cmake>=3.21
Requires-Dist: ninja
Requires-Dist: psutil
Requires-Dist: sentencepiece
Requires-Dist: numpy
Requires-Dist: requests
Requires-Dist: py-cpuinfo
Requires-Dist: transformers>=4.40.0
Requires-Dist: tokenizers>=0.19.1
Requires-Dist: fastapi
Requires-Dist: openai
Requires-Dist: uvicorn[standard]
Requires-Dist: pydantic>=2.0
Requires-Dist: prometheus_client>=0.18.0
Requires-Dist: prometheus-fastapi-instrumentator>=7.0.0
Requires-Dist: tiktoken==0.6.0
Requires-Dist: lm-format-enforcer==0.10.1
Requires-Dist: typing_extensions
Requires-Dist: filelock>=3.10.4
Requires-Dist: ray==2.9.3
Requires-Dist: pynvml==11.5.0
Requires-Dist: outlines==0.0.34
Requires-Dist: npu-vllm==0.4.2
Provides-Extra: tensorizer
Requires-Dist: tensorizer==2.9.0; extra == "tensorizer"
Dynamic: author
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: project-url
Dynamic: provides-extra
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

<p align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
  </picture>
</p>

<h3 align="center">
Easy, fast, and cheap LLM serving for everyone
</h3>

<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
</p>

*Latest News* 🔥
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
- [2023/12] Added ROCm 5.7 support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2 models on vLLM with a single command!
- [2023/06] Serving vLLM on any cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.

vLLM is fast with:

- State-of-the-art serving throughput
- Efficient management of attention key and value memory with **PagedAttention**
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV cache
- Optimized CUDA kernels

vLLM is flexible and easy to use with:

- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support for NVIDIA GPUs and AMD GPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-LoRA support

vLLM seamlessly supports many Hugging Face models, including the following architectures:

- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
- StableLM (`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2 (`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

```bash
pip install vllm
```
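
Once installed, the offline-inference entry point is the `LLM` class exported by the `vllm` package whose sources are listed above. The following is a minimal sketch following the upstream vLLM quickstart; the model name is only an illustrative choice:

```python
# Minimal offline-inference sketch following the upstream vLLM quickstart.
# The model name is only an example; any supported architecture listed above works.
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")  # downloads weights from Hugging Face
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(f"{output.prompt!r} -> {output.outputs[0].text!r}")
```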

## Getting Started

Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)

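The package also bundles the OpenAI-compatible server (`vllm/entrypoints/openai/` in the file list above). As a hedged sketch, assuming the server was started locally with `python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m` and an `openai>=1.0` client is available, a completion request looks like:

```python
# Sketch: query a locally running vLLM OpenAI-compatible server.
# Assumes the server is on the default port 8000; vLLM does not require a
# real API key unless one was configured, so a placeholder suffices.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="San Francisco is a",
    max_tokens=32,
)
print(completion.choices[0].text)
```
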
## Contributing

We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

## Citation

If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
```bibtex
@inproceedings{kwon2023efficient,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
  booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
  year={2023}
}
```