vllm_npu-0.4.2.tar.gz

Files changed (310)
  1. vllm_npu-0.4.2/CMakeLists.txt +294 -0
  2. vllm_npu-0.4.2/LICENSE +201 -0
  3. vllm_npu-0.4.2/MANIFEST.in +10 -0
  4. vllm_npu-0.4.2/PKG-INFO +173 -0
  5. vllm_npu-0.4.2/README.md +118 -0
  6. vllm_npu-0.4.2/cmake/cpu_extension.cmake +90 -0
  7. vllm_npu-0.4.2/cmake/hipify.py +73 -0
  8. vllm_npu-0.4.2/cmake/utils.cmake +354 -0
  9. vllm_npu-0.4.2/csrc/activation_kernels.cu +161 -0
  10. vllm_npu-0.4.2/csrc/attention/attention_dtypes.h +7 -0
  11. vllm_npu-0.4.2/csrc/attention/attention_generic.cuh +64 -0
  12. vllm_npu-0.4.2/csrc/attention/attention_kernels.cu +980 -0
  13. vllm_npu-0.4.2/csrc/attention/attention_utils.cuh +56 -0
  14. vllm_npu-0.4.2/csrc/attention/dtype_bfloat16.cuh +451 -0
  15. vllm_npu-0.4.2/csrc/attention/dtype_float16.cuh +502 -0
  16. vllm_npu-0.4.2/csrc/attention/dtype_float32.cuh +273 -0
  17. vllm_npu-0.4.2/csrc/attention/dtype_fp8.cuh +35 -0
  18. vllm_npu-0.4.2/csrc/cache.h +38 -0
  19. vllm_npu-0.4.2/csrc/cache_kernels.cu +419 -0
  20. vllm_npu-0.4.2/csrc/cpu/activation.cpp +148 -0
  21. vllm_npu-0.4.2/csrc/cpu/attention.cpp +746 -0
  22. vllm_npu-0.4.2/csrc/cpu/cache.cpp +141 -0
  23. vllm_npu-0.4.2/csrc/cpu/cpu_types.hpp +352 -0
  24. vllm_npu-0.4.2/csrc/cpu/layernorm.cpp +117 -0
  25. vllm_npu-0.4.2/csrc/cpu/pos_encoding.cpp +199 -0
  26. vllm_npu-0.4.2/csrc/cpu/pybind.cpp +73 -0
  27. vllm_npu-0.4.2/csrc/cuda_compat.h +38 -0
  28. vllm_npu-0.4.2/csrc/cuda_utils.h +10 -0
  29. vllm_npu-0.4.2/csrc/cuda_utils_kernels.cu +35 -0
  30. vllm_npu-0.4.2/csrc/custom_all_reduce.cu +148 -0
  31. vllm_npu-0.4.2/csrc/custom_all_reduce.cuh +485 -0
  32. vllm_npu-0.4.2/csrc/custom_all_reduce_test.cu +316 -0
  33. vllm_npu-0.4.2/csrc/dispatch_utils.h +37 -0
  34. vllm_npu-0.4.2/csrc/layernorm_kernels.cu +352 -0
  35. vllm_npu-0.4.2/csrc/moe/moe_ops.cpp +7 -0
  36. vllm_npu-0.4.2/csrc/moe/moe_ops.h +9 -0
  37. vllm_npu-0.4.2/csrc/moe/topk_softmax_kernels.cu +499 -0
  38. vllm_npu-0.4.2/csrc/moe_align_block_size_kernels.cu +125 -0
  39. vllm_npu-0.4.2/csrc/ops.h +206 -0
  40. vllm_npu-0.4.2/csrc/pos_encoding_kernels.cu +226 -0
  41. vllm_npu-0.4.2/csrc/punica/LICENSE +217 -0
  42. vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu +5 -0
  43. vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu +5 -0
  44. vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_config.h +162 -0
  45. vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu +5 -0
  46. vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu +5 -0
  47. vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu +5 -0
  48. vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu +5 -0
  49. vllm_npu-0.4.2/csrc/punica/bgmv/bgmv_impl.cuh +297 -0
  50. vllm_npu-0.4.2/csrc/punica/bgmv/generator.py +48 -0
  51. vllm_npu-0.4.2/csrc/punica/bgmv/vec_dtypes.cuh +1324 -0
  52. vllm_npu-0.4.2/csrc/punica/punica_ops.cc +582 -0
  53. vllm_npu-0.4.2/csrc/pybind.cpp +136 -0
  54. vllm_npu-0.4.2/csrc/quantization/aqlm/gemm_kernels.cu +712 -0
  55. vllm_npu-0.4.2/csrc/quantization/awq/dequantize.cuh +87 -0
  56. vllm_npu-0.4.2/csrc/quantization/awq/gemm_kernels.cu +446 -0
  57. vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/hip_float8.h +167 -0
  58. vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/hip_float8_impl.h +316 -0
  59. vllm_npu-0.4.2/csrc/quantization/fp8/amd_detail/quant_utils.cuh +517 -0
  60. vllm_npu-0.4.2/csrc/quantization/fp8/fp8_cuda_kernels.cu +126 -0
  61. vllm_npu-0.4.2/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh +277 -0
  62. vllm_npu-0.4.2/csrc/quantization/gptq/compat.cuh +64 -0
  63. vllm_npu-0.4.2/csrc/quantization/gptq/matrix_view.cuh +274 -0
  64. vllm_npu-0.4.2/csrc/quantization/gptq/q_gemm.cu +2075 -0
  65. vllm_npu-0.4.2/csrc/quantization/gptq/qdq_2.cuh +87 -0
  66. vllm_npu-0.4.2/csrc/quantization/gptq/qdq_3.cuh +141 -0
  67. vllm_npu-0.4.2/csrc/quantization/gptq/qdq_4.cuh +147 -0
  68. vllm_npu-0.4.2/csrc/quantization/gptq/qdq_8.cuh +40 -0
  69. vllm_npu-0.4.2/csrc/quantization/gptq/qdq_util.cuh +60 -0
  70. vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin.cu +1722 -0
  71. vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin.cuh +70 -0
  72. vllm_npu-0.4.2/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu +352 -0
  73. vllm_npu-0.4.2/csrc/quantization/marlin/LICENSE +209 -0
  74. vllm_npu-0.4.2/csrc/quantization/marlin/marlin_cuda_kernel.cu +1138 -0
  75. vllm_npu-0.4.2/csrc/quantization/squeezellm/quant_cuda_kernel.cu +225 -0
  76. vllm_npu-0.4.2/csrc/reduction_utils.cuh +65 -0
  77. vllm_npu-0.4.2/pyproject.toml +67 -0
  78. vllm_npu-0.4.2/requirements-common.txt +20 -0
  79. vllm_npu-0.4.2/requirements-cpu.txt +6 -0
  80. vllm_npu-0.4.2/requirements-cuda.txt +9 -0
  81. vllm_npu-0.4.2/requirements-neuron.txt +7 -0
  82. vllm_npu-0.4.2/requirements-rocm.txt +5 -0
  83. vllm_npu-0.4.2/setup.cfg +4 -0
  84. vllm_npu-0.4.2/setup.py +299 -0
  85. vllm_npu-0.4.2/tests/test_cache_block_hashing.py +93 -0
  86. vllm_npu-0.4.2/tests/test_config.py +39 -0
  87. vllm_npu-0.4.2/tests/test_logger.py +214 -0
  88. vllm_npu-0.4.2/tests/test_logits_processor.py +103 -0
  89. vllm_npu-0.4.2/tests/test_regression.py +58 -0
  90. vllm_npu-0.4.2/tests/test_sampling_params.py +13 -0
  91. vllm_npu-0.4.2/tests/test_sequence.py +124 -0
  92. vllm_npu-0.4.2/vllm/__init__.py +23 -0
  93. vllm_npu-0.4.2/vllm/_custom_ops.py +251 -0
  94. vllm_npu-0.4.2/vllm/attention/__init__.py +13 -0
  95. vllm_npu-0.4.2/vllm/attention/backends/__init__.py +0 -0
  96. vllm_npu-0.4.2/vllm/attention/backends/abstract.py +127 -0
  97. vllm_npu-0.4.2/vllm/attention/backends/flash_attn.py +271 -0
  98. vllm_npu-0.4.2/vllm/attention/backends/flashinfer.py +220 -0
  99. vllm_npu-0.4.2/vllm/attention/backends/rocm_flash_attn.py +374 -0
  100. vllm_npu-0.4.2/vllm/attention/backends/torch_sdpa.py +250 -0
  101. vllm_npu-0.4.2/vllm/attention/backends/xformers.py +393 -0
  102. vllm_npu-0.4.2/vllm/attention/layer.py +56 -0
  103. vllm_npu-0.4.2/vllm/attention/ops/__init__.py +0 -0
  104. vllm_npu-0.4.2/vllm/attention/ops/paged_attn.py +216 -0
  105. vllm_npu-0.4.2/vllm/attention/ops/prefix_prefill.py +792 -0
  106. vllm_npu-0.4.2/vllm/attention/ops/triton_flash_attention.py +810 -0
  107. vllm_npu-0.4.2/vllm/attention/selector.py +91 -0
  108. vllm_npu-0.4.2/vllm/block.py +84 -0
  109. vllm_npu-0.4.2/vllm/config.py +1225 -0
  110. vllm_npu-0.4.2/vllm/core/__init__.py +0 -0
  111. vllm_npu-0.4.2/vllm/core/block/__init__.py +0 -0
  112. vllm_npu-0.4.2/vllm/core/block/block_table.py +295 -0
  113. vllm_npu-0.4.2/vllm/core/block/common.py +199 -0
  114. vllm_npu-0.4.2/vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  115. vllm_npu-0.4.2/vllm/core/block/interfaces.py +205 -0
  116. vllm_npu-0.4.2/vllm/core/block/naive_block.py +318 -0
  117. vllm_npu-0.4.2/vllm/core/block/prefix_caching_block.py +606 -0
  118. vllm_npu-0.4.2/vllm/core/block_manager_v1.py +625 -0
  119. vllm_npu-0.4.2/vllm/core/block_manager_v2.py +258 -0
  120. vllm_npu-0.4.2/vllm/core/evictor_v1.py +105 -0
  121. vllm_npu-0.4.2/vllm/core/evictor_v2.py +127 -0
  122. vllm_npu-0.4.2/vllm/core/interfaces.py +113 -0
  123. vllm_npu-0.4.2/vllm/core/policy.py +45 -0
  124. vllm_npu-0.4.2/vllm/core/scheduler.py +1163 -0
  125. vllm_npu-0.4.2/vllm/distributed/__init__.py +3 -0
  126. vllm_npu-0.4.2/vllm/distributed/communication_op.py +237 -0
  127. vllm_npu-0.4.2/vllm/distributed/device_communicators/__init__.py +0 -0
  128. vllm_npu-0.4.2/vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  129. vllm_npu-0.4.2/vllm/distributed/device_communicators/pynccl.py +287 -0
  130. vllm_npu-0.4.2/vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  131. vllm_npu-0.4.2/vllm/distributed/parallel_state.py +339 -0
  132. vllm_npu-0.4.2/vllm/distributed/utils.py +136 -0
  133. vllm_npu-0.4.2/vllm/engine/__init__.py +0 -0
  134. vllm_npu-0.4.2/vllm/engine/arg_utils.py +649 -0
  135. vllm_npu-0.4.2/vllm/engine/async_llm_engine.py +737 -0
  136. vllm_npu-0.4.2/vllm/engine/llm_engine.py +784 -0
  137. vllm_npu-0.4.2/vllm/engine/metrics.py +368 -0
  138. vllm_npu-0.4.2/vllm/engine/output_processor/__init__.py +0 -0
  139. vllm_npu-0.4.2/vllm/engine/output_processor/interfaces.py +76 -0
  140. vllm_npu-0.4.2/vllm/engine/output_processor/multi_step.py +142 -0
  141. vllm_npu-0.4.2/vllm/engine/output_processor/single_step.py +284 -0
  142. vllm_npu-0.4.2/vllm/engine/output_processor/stop_checker.py +101 -0
  143. vllm_npu-0.4.2/vllm/engine/output_processor/util.py +19 -0
  144. vllm_npu-0.4.2/vllm/entrypoints/__init__.py +0 -0
  145. vllm_npu-0.4.2/vllm/entrypoints/api_server.py +119 -0
  146. vllm_npu-0.4.2/vllm/entrypoints/llm.py +259 -0
  147. vllm_npu-0.4.2/vllm/entrypoints/openai/__init__.py +0 -0
  148. vllm_npu-0.4.2/vllm/entrypoints/openai/api_server.py +186 -0
  149. vllm_npu-0.4.2/vllm/entrypoints/openai/cli_args.py +115 -0
  150. vllm_npu-0.4.2/vllm/entrypoints/openai/protocol.py +460 -0
  151. vllm_npu-0.4.2/vllm/entrypoints/openai/serving_chat.py +392 -0
  152. vllm_npu-0.4.2/vllm/entrypoints/openai/serving_completion.py +347 -0
  153. vllm_npu-0.4.2/vllm/entrypoints/openai/serving_engine.py +234 -0
  154. vllm_npu-0.4.2/vllm/envs.py +217 -0
  155. vllm_npu-0.4.2/vllm/executor/__init__.py +0 -0
  156. vllm_npu-0.4.2/vllm/executor/cpu_executor.py +152 -0
  157. vllm_npu-0.4.2/vllm/executor/distributed_gpu_executor.py +115 -0
  158. vllm_npu-0.4.2/vllm/executor/executor_base.py +115 -0
  159. vllm_npu-0.4.2/vllm/executor/gpu_executor.py +150 -0
  160. vllm_npu-0.4.2/vllm/executor/multiproc_worker_utils.py +263 -0
  161. vllm_npu-0.4.2/vllm/executor/neuron_executor.py +91 -0
  162. vllm_npu-0.4.2/vllm/executor/ray_gpu_executor.py +327 -0
  163. vllm_npu-0.4.2/vllm/executor/ray_utils.py +119 -0
  164. vllm_npu-0.4.2/vllm/logger.py +153 -0
  165. vllm_npu-0.4.2/vllm/logging/__init__.py +5 -0
  166. vllm_npu-0.4.2/vllm/logging/formatter.py +15 -0
  167. vllm_npu-0.4.2/vllm/lora/__init__.py +0 -0
  168. vllm_npu-0.4.2/vllm/lora/fully_sharded_layers.py +262 -0
  169. vllm_npu-0.4.2/vllm/lora/layers.py +1181 -0
  170. vllm_npu-0.4.2/vllm/lora/lora.py +167 -0
  171. vllm_npu-0.4.2/vllm/lora/models.py +645 -0
  172. vllm_npu-0.4.2/vllm/lora/punica.py +213 -0
  173. vllm_npu-0.4.2/vllm/lora/request.py +32 -0
  174. vllm_npu-0.4.2/vllm/lora/utils.py +98 -0
  175. vllm_npu-0.4.2/vllm/lora/worker_manager.py +251 -0
  176. vllm_npu-0.4.2/vllm/model_executor/__init__.py +7 -0
  177. vllm_npu-0.4.2/vllm/model_executor/guided_decoding/__init__.py +25 -0
  178. vllm_npu-0.4.2/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  179. vllm_npu-0.4.2/vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  180. vllm_npu-0.4.2/vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  181. vllm_npu-0.4.2/vllm/model_executor/layers/__init__.py +0 -0
  182. vllm_npu-0.4.2/vllm/model_executor/layers/activation.py +173 -0
  183. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  184. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  185. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  186. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  187. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  188. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  189. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  190. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  191. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  192. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  193. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  194. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  195. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  196. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  197. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  198. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  199. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  200. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  201. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  202. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  203. vllm_npu-0.4.2/vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  204. vllm_npu-0.4.2/vllm/model_executor/layers/layernorm.py +71 -0
  205. vllm_npu-0.4.2/vllm/model_executor/layers/linear.py +709 -0
  206. vllm_npu-0.4.2/vllm/model_executor/layers/logits_processor.py +115 -0
  207. vllm_npu-0.4.2/vllm/model_executor/layers/ops/__init__.py +0 -0
  208. vllm_npu-0.4.2/vllm/model_executor/layers/ops/rand.py +157 -0
  209. vllm_npu-0.4.2/vllm/model_executor/layers/ops/sample.py +406 -0
  210. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/__init__.py +35 -0
  211. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/aqlm.py +376 -0
  212. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/awq.py +175 -0
  213. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/base_config.py +97 -0
  214. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/fp8.py +265 -0
  215. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/gptq.py +224 -0
  216. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  217. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/marlin.py +227 -0
  218. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/schema.py +84 -0
  219. vllm_npu-0.4.2/vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  220. vllm_npu-0.4.2/vllm/model_executor/layers/rejection_sampler.py +405 -0
  221. vllm_npu-0.4.2/vllm/model_executor/layers/rotary_embedding.py +525 -0
  222. vllm_npu-0.4.2/vllm/model_executor/layers/sampler.py +1051 -0
  223. vllm_npu-0.4.2/vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  224. vllm_npu-0.4.2/vllm/model_executor/model_loader/__init__.py +30 -0
  225. vllm_npu-0.4.2/vllm/model_executor/model_loader/loader.py +362 -0
  226. vllm_npu-0.4.2/vllm/model_executor/model_loader/neuron.py +136 -0
  227. vllm_npu-0.4.2/vllm/model_executor/model_loader/tensorizer.py +368 -0
  228. vllm_npu-0.4.2/vllm/model_executor/model_loader/utils.py +41 -0
  229. vllm_npu-0.4.2/vllm/model_executor/model_loader/weight_utils.py +372 -0
  230. vllm_npu-0.4.2/vllm/model_executor/models/__init__.py +119 -0
  231. vllm_npu-0.4.2/vllm/model_executor/models/baichuan.py +410 -0
  232. vllm_npu-0.4.2/vllm/model_executor/models/bloom.py +327 -0
  233. vllm_npu-0.4.2/vllm/model_executor/models/chatglm.py +386 -0
  234. vllm_npu-0.4.2/vllm/model_executor/models/commandr.py +373 -0
  235. vllm_npu-0.4.2/vllm/model_executor/models/dbrx.py +413 -0
  236. vllm_npu-0.4.2/vllm/model_executor/models/decilm.py +122 -0
  237. vllm_npu-0.4.2/vllm/model_executor/models/deepseek.py +438 -0
  238. vllm_npu-0.4.2/vllm/model_executor/models/falcon.py +444 -0
  239. vllm_npu-0.4.2/vllm/model_executor/models/gemma.py +393 -0
  240. vllm_npu-0.4.2/vllm/model_executor/models/gpt2.py +266 -0
  241. vllm_npu-0.4.2/vllm/model_executor/models/gpt_bigcode.py +274 -0
  242. vllm_npu-0.4.2/vllm/model_executor/models/gpt_j.py +281 -0
  243. vllm_npu-0.4.2/vllm/model_executor/models/gpt_neox.py +295 -0
  244. vllm_npu-0.4.2/vllm/model_executor/models/internlm2.py +323 -0
  245. vllm_npu-0.4.2/vllm/model_executor/models/jais.py +333 -0
  246. vllm_npu-0.4.2/vllm/model_executor/models/llama.py +442 -0
  247. vllm_npu-0.4.2/vllm/model_executor/models/llava.py +239 -0
  248. vllm_npu-0.4.2/vllm/model_executor/models/minicpm.py +531 -0
  249. vllm_npu-0.4.2/vllm/model_executor/models/mixtral.py +583 -0
  250. vllm_npu-0.4.2/vllm/model_executor/models/mixtral_quant.py +404 -0
  251. vllm_npu-0.4.2/vllm/model_executor/models/mpt.py +295 -0
  252. vllm_npu-0.4.2/vllm/model_executor/models/olmo.py +356 -0
  253. vllm_npu-0.4.2/vllm/model_executor/models/opt.py +349 -0
  254. vllm_npu-0.4.2/vllm/model_executor/models/orion.py +319 -0
  255. vllm_npu-0.4.2/vllm/model_executor/models/phi.py +300 -0
  256. vllm_npu-0.4.2/vllm/model_executor/models/qwen.py +284 -0
  257. vllm_npu-0.4.2/vllm/model_executor/models/qwen2.py +367 -0
  258. vllm_npu-0.4.2/vllm/model_executor/models/qwen2_moe.py +447 -0
  259. vllm_npu-0.4.2/vllm/model_executor/models/stablelm.py +301 -0
  260. vllm_npu-0.4.2/vllm/model_executor/models/starcoder2.py +302 -0
  261. vllm_npu-0.4.2/vllm/model_executor/models/xverse.py +366 -0
  262. vllm_npu-0.4.2/vllm/model_executor/sampling_metadata.py +588 -0
  263. vllm_npu-0.4.2/vllm/model_executor/utils.py +35 -0
  264. vllm_npu-0.4.2/vllm/outputs.py +150 -0
  265. vllm_npu-0.4.2/vllm/py.typed +2 -0
  266. vllm_npu-0.4.2/vllm/sampling_params.py +340 -0
  267. vllm_npu-0.4.2/vllm/sequence.py +766 -0
  268. vllm_npu-0.4.2/vllm/spec_decode/__init__.py +0 -0
  269. vllm_npu-0.4.2/vllm/spec_decode/batch_expansion.py +397 -0
  270. vllm_npu-0.4.2/vllm/spec_decode/interfaces.py +73 -0
  271. vllm_npu-0.4.2/vllm/spec_decode/metrics.py +191 -0
  272. vllm_npu-0.4.2/vllm/spec_decode/multi_step_worker.py +203 -0
  273. vllm_npu-0.4.2/vllm/spec_decode/ngram_worker.py +176 -0
  274. vllm_npu-0.4.2/vllm/spec_decode/spec_decode_worker.py +472 -0
  275. vllm_npu-0.4.2/vllm/spec_decode/top1_proposer.py +200 -0
  276. vllm_npu-0.4.2/vllm/spec_decode/util.py +228 -0
  277. vllm_npu-0.4.2/vllm/test_utils.py +41 -0
  278. vllm_npu-0.4.2/vllm/transformers_utils/__init__.py +0 -0
  279. vllm_npu-0.4.2/vllm/transformers_utils/config.py +58 -0
  280. vllm_npu-0.4.2/vllm/transformers_utils/configs/__init__.py +16 -0
  281. vllm_npu-0.4.2/vllm/transformers_utils/configs/chatglm.py +68 -0
  282. vllm_npu-0.4.2/vllm/transformers_utils/configs/dbrx.py +278 -0
  283. vllm_npu-0.4.2/vllm/transformers_utils/configs/falcon.py +87 -0
  284. vllm_npu-0.4.2/vllm/transformers_utils/configs/jais.py +236 -0
  285. vllm_npu-0.4.2/vllm/transformers_utils/configs/mpt.py +178 -0
  286. vllm_npu-0.4.2/vllm/transformers_utils/detokenizer.py +313 -0
  287. vllm_npu-0.4.2/vllm/transformers_utils/tokenizer.py +149 -0
  288. vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  289. vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  290. vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  291. vllm_npu-0.4.2/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  292. vllm_npu-0.4.2/vllm/transformers_utils/tokenizers/__init__.py +5 -0
  293. vllm_npu-0.4.2/vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  294. vllm_npu-0.4.2/vllm/usage/__init__.py +0 -0
  295. vllm_npu-0.4.2/vllm/usage/usage_lib.py +209 -0
  296. vllm_npu-0.4.2/vllm/utils.py +677 -0
  297. vllm_npu-0.4.2/vllm/worker/__init__.py +0 -0
  298. vllm_npu-0.4.2/vllm/worker/cache_engine.py +105 -0
  299. vllm_npu-0.4.2/vllm/worker/cpu_model_runner.py +346 -0
  300. vllm_npu-0.4.2/vllm/worker/cpu_worker.py +321 -0
  301. vllm_npu-0.4.2/vllm/worker/model_runner.py +1168 -0
  302. vllm_npu-0.4.2/vllm/worker/neuron_model_runner.py +196 -0
  303. vllm_npu-0.4.2/vllm/worker/neuron_worker.py +98 -0
  304. vllm_npu-0.4.2/vllm/worker/worker.py +345 -0
  305. vllm_npu-0.4.2/vllm/worker/worker_base.py +146 -0
  306. vllm_npu-0.4.2/vllm_npu.egg-info/PKG-INFO +173 -0
  307. vllm_npu-0.4.2/vllm_npu.egg-info/SOURCES.txt +308 -0
  308. vllm_npu-0.4.2/vllm_npu.egg-info/dependency_links.txt +1 -0
  309. vllm_npu-0.4.2/vllm_npu.egg-info/requires.txt +26 -0
  310. vllm_npu-0.4.2/vllm_npu.egg-info/top_level.txt +1 -0
vllm_npu-0.4.2/CMakeLists.txt ADDED
@@ -0,0 +1,294 @@
+ cmake_minimum_required(VERSION 3.21)
+
+ project(vllm_extensions LANGUAGES CXX)
+
+ option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
+
+ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
+ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
+
+ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
+
+ #
+ # Supported python versions. These versions will be searched in order, the
+ # first match will be selected. These should be kept in sync with setup.py.
+ #
+ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
+
+ # Supported NVIDIA architectures.
+ set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+
+ # Supported AMD GPU architectures.
+ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
+
+ #
+ # Supported/expected torch versions for CUDA/ROCm.
+ #
+ # Currently, having an incorrect pytorch version results in a warning
+ # rather than an error.
+ #
+ # Note: the CUDA torch version is derived from pyproject.toml and various
+ # requirements.txt files and should be kept consistent. The ROCm torch
+ # versions are derived from Dockerfile.rocm
+ #
+ set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
+ set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
+ set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
+
+ #
+ # Try to find python package with an executable that exactly matches
+ # `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
+ #
+ if (VLLM_PYTHON_EXECUTABLE)
+   find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
+ else()
+   message(FATAL_ERROR
+     "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
+     " before running cmake configure.")
+ endif()
+
+ #
+ # Update cmake's `CMAKE_PREFIX_PATH` with torch location.
+ #
+ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
+
+ # Ensure the 'nvcc' command is in the PATH
+ find_program(NVCC_EXECUTABLE nvcc)
+ if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
+   message(FATAL_ERROR "nvcc not found")
+ endif()
+
+ #
+ # Import torch cmake configuration.
+ # Torch also imports CUDA (and partially HIP) languages with some customizations,
+ # so there is no need to do this explicitly with check_language/enable_language,
+ # etc.
+ #
+ find_package(Torch REQUIRED)
+
+ #
+ # Normally `torch.utils.cpp_extension.CUDAExtension` would add
+ # `libtorch_python.so` for linking against an extension. Torch's cmake
+ # configuration does not include this library (presumably since the cmake
+ # config is used for standalone C++ binaries that link against torch).
+ # The `libtorch_python.so` library defines some of the glue code between
+ # torch/python via pybind and is required by VLLM extensions for this
+ # reason. So, add it manually with `find_library` using torch's
+ # installed library path.
+ #
+ find_library(torch_python_LIBRARY torch_python PATHS
+   "${TORCH_INSTALL_PREFIX}/lib")
+
+ #
+ # Forward the non-CUDA device extensions to external CMake scripts.
+ #
+ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
+     NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
+   if (VLLM_TARGET_DEVICE STREQUAL "cpu")
+     include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
+   else()
+     message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
+   endif()
+   return()
+ endif()
+
+ #
+ # Set up GPU language and check the torch version and warn if it isn't
+ # what is expected.
+ #
+ if (NOT HIP_FOUND AND CUDA_FOUND)
+   set(VLLM_GPU_LANG "CUDA")
+
+   if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
+     message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
+       "expected for CUDA build, saw ${Torch_VERSION} instead.")
+   endif()
+ elseif(HIP_FOUND)
+   set(VLLM_GPU_LANG "HIP")
+
+   # Importing torch recognizes and sets up some HIP/ROCm configuration but does
+   # not let cmake recognize .hip files. In order to get cmake to understand the
+   # .hip extension automatically, HIP must be enabled explicitly.
+   enable_language(HIP)
+
+   # ROCm 5.x
+   if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
+       NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
+     message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
+       "expected for ROCm 5.x build, saw ${Torch_VERSION} instead.")
+   endif()
+
+   # ROCm 6.x
+   if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
+       NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
+     message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
+       "expected for ROCm 6.x build, saw ${Torch_VERSION} instead.")
+   endif()
+ else()
+   message(FATAL_ERROR "Can't find CUDA or HIP installation.")
+ endif()
+
+ #
+ # Override the GPU architectures detected by cmake/torch and filter them by
+ # the supported versions for the current language.
+ # The final set of arches is stored in `VLLM_GPU_ARCHES`.
+ #
+ override_gpu_arches(VLLM_GPU_ARCHES
+   ${VLLM_GPU_LANG}
+   "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+
+ #
+ # Query torch for additional GPU compilation flags for the given
+ # `VLLM_GPU_LANG`.
+ # The final set of flags is stored in `VLLM_GPU_FLAGS`.
+ #
+ get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
+
+ #
+ # Set nvcc parallelism.
+ #
+ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
+   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
+ endif()
+
+ #
+ # Define extension targets
+ #
+
+ #
+ # _C extension
+ #
+
+ set(VLLM_EXT_SRC
+   "csrc/cache_kernels.cu"
+   "csrc/attention/attention_kernels.cu"
+   "csrc/pos_encoding_kernels.cu"
+   "csrc/activation_kernels.cu"
+   "csrc/layernorm_kernels.cu"
+   "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
+   "csrc/quantization/gptq/q_gemm.cu"
+   "csrc/quantization/fp8/fp8_cuda_kernels.cu"
+   "csrc/cuda_utils_kernels.cu"
+   "csrc/moe_align_block_size_kernels.cu"
+   "csrc/pybind.cpp")
+
+ if(VLLM_GPU_LANG STREQUAL "CUDA")
+   list(APPEND VLLM_EXT_SRC
+     "csrc/quantization/aqlm/gemm_kernels.cu"
+     "csrc/quantization/awq/gemm_kernels.cu"
+     "csrc/quantization/marlin/marlin_cuda_kernel.cu"
+     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+     "csrc/custom_all_reduce.cu")
+ endif()
+
+ define_gpu_extension_target(
+   _C
+   DESTINATION vllm
+   LANGUAGE ${VLLM_GPU_LANG}
+   SOURCES ${VLLM_EXT_SRC}
+   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+   ARCHITECTURES ${VLLM_GPU_ARCHES}
+   WITH_SOABI)
+
+ #
+ # _moe_C extension
+ #
+
+ set(VLLM_MOE_EXT_SRC
+   "csrc/moe/moe_ops.cpp"
+   "csrc/moe/topk_softmax_kernels.cu")
+
+ define_gpu_extension_target(
+   _moe_C
+   DESTINATION vllm
+   LANGUAGE ${VLLM_GPU_LANG}
+   SOURCES ${VLLM_MOE_EXT_SRC}
+   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+   ARCHITECTURES ${VLLM_GPU_ARCHES}
+   WITH_SOABI)
+
+ #
+ # _punica_C extension
+ #
+
+ set(VLLM_PUNICA_EXT_SRC
+   "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
+   "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
+   "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
+   "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
+   "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
+   "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
+   "csrc/punica/punica_ops.cc")
+
+ #
+ # Copy GPU compilation flags + update for punica
+ #
+ set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
+ list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
+   "-D__CUDA_NO_HALF_OPERATORS__"
+   "-D__CUDA_NO_HALF_CONVERSIONS__"
+   "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
+   "-D__CUDA_NO_HALF2_OPERATORS__")
+
+ #
+ # Filter out CUDA architectures < 8.0 for punica.
+ #
+ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
+   set(VLLM_PUNICA_GPU_ARCHES)
+   foreach(ARCH ${VLLM_GPU_ARCHES})
+     string_to_ver(CODE_VER ${ARCH})
+     if (CODE_VER GREATER_EQUAL 8.0)
+       list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
+     endif()
+   endforeach()
+   message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+ endif()
+
+ if (VLLM_PUNICA_GPU_ARCHES)
+   define_gpu_extension_target(
+     _punica_C
+     DESTINATION vllm
+     LANGUAGE ${VLLM_GPU_LANG}
+     SOURCES ${VLLM_PUNICA_EXT_SRC}
+     COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
+     ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
+     WITH_SOABI)
+ else()
+   message(WARNING "Unable to create _punica_C target because none of the "
+     "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
+ endif()
+
+ #
+ # Add the `default` target which detects which extensions should be
+ # built based on platform/architecture. This is the same logic that
+ # setup.py uses to select which extensions should be built and should
+ # be kept in sync.
+ #
+ # The `default` target makes direct use of cmake easier since knowledge
+ # of which extensions are supported has been factored in, e.g.
+ #
+ # mkdir build && cd build
+ # cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
+ # cmake --build . --target default
+ #
+ add_custom_target(default)
+
+ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+   message(STATUS "Enabling C extension.")
+   add_dependencies(default _C)
+ endif()
+
+ if(VLLM_GPU_LANG STREQUAL "CUDA")
+   message(STATUS "Enabling moe extension.")
+   add_dependencies(default _moe_C)
+
+   # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
+   # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
+   # there are supported target arches.
+   if (VLLM_PUNICA_GPU_ARCHES AND
+       (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
+     message(STATUS "Enabling punica extension.")
+     add_dependencies(default _punica_C)
+   endif()
+ endif()
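For a concrete invocation, here is a minimal out-of-tree build sketch following the `default`-target comment in the file above (a sketch only, assuming a CUDA toolchain, Ninja, and a supported Python are installed; `-DVLLM_TARGET_DEVICE` and `-DNVCC_THREADS` are the optional knobs the script reads):

```bash
# Configure from a scratch build directory. VLLM_PYTHON_EXECUTABLE is
# mandatory; without it, configuration stops at the FATAL_ERROR shown above.
mkdir build && cd build
cmake -G Ninja \
    -DVLLM_PYTHON_EXECUTABLE="$(which python3)" \
    -DVLLM_TARGET_DEVICE=cuda \
    -DNVCC_THREADS=4 \
    -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm \
    ..
# Build whichever extension targets the platform supports (_C, _moe_C, _punica_C).
cmake --build . --target default
```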
vllm_npu-0.4.2/LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
vllm_npu-0.4.2/MANIFEST.in ADDED
@@ -0,0 +1,10 @@
+ include LICENSE
+ include requirements-common.txt
+ include requirements-cuda.txt
+ include requirements-rocm.txt
+ include requirements-neuron.txt
+ include requirements-cpu.txt
+ include CMakeLists.txt
+
+ recursive-include cmake *
+ recursive-include csrc *
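To see what this manifest actually controls, a hedged sketch of producing the source distribution (assumes the PyPA `build` package; the output filename follows from the package metadata below):

```bash
# MANIFEST.in shapes only the sdist: it pulls LICENSE, the per-backend
# requirements files, CMakeLists.txt, and the cmake/ and csrc/ trees into
# the tarball so the C++/CUDA extensions can be compiled from source.
pip install build
python -m build --sdist    # expected output: dist/vllm_npu-0.4.2.tar.gz
```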
vllm_npu-0.4.2/PKG-INFO ADDED
@@ -0,0 +1,173 @@
+ Metadata-Version: 2.2
+ Name: vllm_npu
+ Version: 0.4.2
+ Summary: A high-throughput and memory-efficient inference and serving engine for LLMs
+ Home-page: https://github.com/vllm-project/vllm
+ Author: vLLM Team
+ License: Apache 2.0
+ Project-URL: Homepage, https://github.com/vllm-project/vllm
+ Project-URL: Documentation, https://vllm.readthedocs.io/en/latest/
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: cmake>=3.21
+ Requires-Dist: ninja
+ Requires-Dist: psutil
+ Requires-Dist: sentencepiece
+ Requires-Dist: numpy
+ Requires-Dist: requests
+ Requires-Dist: py-cpuinfo
+ Requires-Dist: transformers>=4.40.0
+ Requires-Dist: tokenizers>=0.19.1
+ Requires-Dist: fastapi
+ Requires-Dist: openai
+ Requires-Dist: uvicorn[standard]
+ Requires-Dist: pydantic>=2.0
+ Requires-Dist: prometheus_client>=0.18.0
+ Requires-Dist: prometheus-fastapi-instrumentator>=7.0.0
+ Requires-Dist: tiktoken==0.6.0
+ Requires-Dist: lm-format-enforcer==0.10.1
+ Requires-Dist: typing_extensions
+ Requires-Dist: filelock>=3.10.4
+ Requires-Dist: ray==2.9.3
+ Requires-Dist: pynvml==11.5.0
+ Requires-Dist: outlines==0.0.34
+ Requires-Dist: npu-vllm==0.4.2
+ Provides-Extra: tensorizer
+ Requires-Dist: tensorizer==2.9.0; extra == "tensorizer"
+ Dynamic: author
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: project-url
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ <p align="center">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
+ <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
+ </picture>
+ </p>
+
+ <h3 align="center">
+ Easy, fast, and cheap LLM serving for everyone
+ </h3>
+
+ <p align="center">
+ | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
+
+ </p>
+
+ *Latest News* 🔥
+ - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
+ - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
+ - [2024/01] Added ROCm 6.0 support to vLLM.
+ - [2023/12] Added ROCm 5.7 support to vLLM.
+ - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
+ - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
+ - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
+ - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
+ - [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
+ - [2023/06] Serving vLLM on any cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
+ - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
+
+ ---
+ ## About
+ vLLM is a fast and easy-to-use library for LLM inference and serving.
+
+ vLLM is fast with:
+
+ - State-of-the-art serving throughput
+ - Efficient management of attention key and value memory with **PagedAttention**
+ - Continuous batching of incoming requests
+ - Fast model execution with CUDA/HIP graph
+ - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
+ - Optimized CUDA kernels
+
+ vLLM is flexible and easy to use with:
+
+ - Seamless integration with popular Hugging Face models
+ - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
+ - Tensor parallelism support for distributed inference
+ - Streaming outputs
+ - OpenAI-compatible API server
+ - Support for NVIDIA GPUs and AMD GPUs
+ - (Experimental) Prefix caching support
+ - (Experimental) Multi-LoRA support
+
+ vLLM seamlessly supports many Hugging Face models, including the following architectures:
+
+ - Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
+ - Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
+ - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+ - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
+ - Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
+ - DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.)
+ - DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
+ - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
+ - Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
+ - GPT-2 (`gpt2`, `gpt2-xl`, etc.)
+ - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
+ - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
+ - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
+ - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
+ - InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
+ - Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
+ - LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+ - MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
+ - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
+ - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
+ - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
+ - OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
+ - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
+ - Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
+ - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
+ - Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
+ - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
+ - Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
+ - Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
+ - StableLM (`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
+ - Starcoder2 (`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
+ - Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
+ - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+
+ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
+
+ ```bash
+ pip install vllm
+ ```
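Going one step beyond the install command, a minimal usage sketch (model name and sampling values are illustrative; `LLM` and `SamplingParams` are the entry points exported by `vllm/__init__.py` in this package, and the server module is `vllm/entrypoints/openai/api_server.py` from the file list above):

```bash
# Offline batched inference through the Python API.
python - <<'PY'
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")                  # any supported HF model
params = SamplingParams(temperature=0.8, top_p=0.95)  # sampling knobs
for out in llm.generate(["Hello, my name is"], params):
    print(out.outputs[0].text)
PY

# Or expose an OpenAI-compatible HTTP server (defaults to port 8000).
python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m
```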
+
+ ## Getting Started
+
+ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
+ - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
+ - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
+ - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
+
+ ## Contributing
+
+ We welcome and value any contributions and collaborations.
+ Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+
+ ## Citation
+
+ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
+ ```bibtex
+ @inproceedings{kwon2023efficient,
+   title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
+   author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
+   booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
+   year={2023}
+ }
+ ```