vllm_npu-0.4.2-py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/worker/worker_base.py
@@ -0,0 +1,146 @@
+import importlib
+import os
+from abc import ABC, abstractmethod
+from typing import Dict, List, Set, Tuple
+
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import (enable_trace_function_call_for_thread,
+                        update_environment_variables)
+
+logger = init_logger(__name__)
+
+
+class WorkerBase(ABC):
+    """Worker interface that allows vLLM to cleanly separate implementations for
+    different hardware.
+    """
+
+    @abstractmethod
+    def init_device(self) -> None:
+        """Initialize device state, such as loading the model or other on-device
+        memory allocations.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available blocks for the GPU KV cache and
+        swappable CPU KV cache.
+
+        The implementation may run profiling or other heuristics to determine
+        the size of caches.
+
+        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+        are blocks that are "active" on the device and can be appended to.
+        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
+        appended to.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache with the given size in blocks.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        """Executes at least one model step on the given sequences, unless no
+        sequences are provided."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_cache_block_size_bytes(self) -> int:
+        """Return the size of a single cache block, in bytes. Used in
+        speculative decoding.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError
+
+
+class LoraNotSupportedWorkerBase(WorkerBase):
+    """Partial implementation of WorkerBase that raises exceptions when LoRA
+    methods are invoked.
+    """
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        raise ValueError(f"{type(self)} does not support LoRA")
+
+    def remove_lora(self, lora_id: int) -> bool:
+        raise ValueError(f"{type(self)} does not support LoRA")
+
+    def list_loras(self) -> Set[int]:
+        raise ValueError(f"{type(self)} does not support LoRA")
+
+
+class WorkerWrapperBase:
+    """
+    The whole point of this class is to lazily initialize the worker.
+    We first instantiate the WorkerWrapper, which remembers the worker module
+    and class name. Then, when we call `update_environment_variables`, and the
+    real initialization happens in `init_worker`.
+    """
+
+    def __init__(self,
+                 worker_module_name=None,
+                 worker_class_name=None,
+                 trust_remote_code: bool = False) -> None:
+        self.worker_module_name = worker_module_name
+        self.worker_class_name = worker_class_name
+        self.worker = None
+        if trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
+
+    @staticmethod
+    def update_environment_variables(envs: Dict[str, str]) -> None:
+        key = 'CUDA_VISIBLE_DEVICES'
+        if key in envs and key in os.environ:
+            # overwriting CUDA_VISIBLE_DEVICES is desired behavior
+            # suppress the warning in `update_environment_variables`
+            del os.environ[key]
+        update_environment_variables(envs)
+
+    def init_worker(self, *args, **kwargs):
+        """
+        Actual initialization of the worker class, and set up
+        function tracing if required.
+        Arguments are passed to the worker class constructor.
+        """
+        enable_trace_function_call_for_thread()
+
+        mod = importlib.import_module(self.worker_module_name)
+        worker_class = getattr(mod, self.worker_class_name)
+        self.worker = worker_class(*args, **kwargs)
+
+    def execute_method(self, method, *args, **kwargs):
+        try:
+            target = self if self.worker is None else self.worker
+            executor = getattr(target, method)
+            return executor(*args, **kwargs)
+        except Exception as e:
+            # if the driver worker also execute methods,
+            # exceptions in the rest worker may cause deadlock in rpc like ray
+            # see https://github.com/vllm-project/vllm/issues/3455
+            # print the error and inform the user to solve the error
+            msg = (f"Error executing method {method}. "
+                   "This might cause deadlock in distributed execution.")
+            logger.exception(msg)
+            raise e
vllm_npu-0.4.2.dist-info/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
vllm_npu-0.4.2.dist-info/METADATA
@@ -0,0 +1,173 @@
+Metadata-Version: 2.2
+Name: vllm_npu
+Version: 0.4.2
+Summary: A high-throughput and memory-efficient inference and serving engine for LLMs
+Home-page: https://github.com/vllm-project/vllm
+Author: vLLM Team
+License: Apache 2.0
+Project-URL: Homepage, https://github.com/vllm-project/vllm
+Project-URL: Documentation, https://vllm.readthedocs.io/en/latest/
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: cmake>=3.21
+Requires-Dist: ninja
+Requires-Dist: psutil
+Requires-Dist: sentencepiece
+Requires-Dist: numpy
+Requires-Dist: requests
+Requires-Dist: py-cpuinfo
+Requires-Dist: transformers>=4.40.0
+Requires-Dist: tokenizers>=0.19.1
+Requires-Dist: fastapi
+Requires-Dist: openai
+Requires-Dist: uvicorn[standard]
+Requires-Dist: pydantic>=2.0
+Requires-Dist: prometheus_client>=0.18.0
+Requires-Dist: prometheus-fastapi-instrumentator>=7.0.0
+Requires-Dist: tiktoken==0.6.0
+Requires-Dist: lm-format-enforcer==0.10.1
+Requires-Dist: typing_extensions
+Requires-Dist: filelock>=3.10.4
+Requires-Dist: ray==2.9.3
+Requires-Dist: pynvml==11.5.0
+Requires-Dist: outlines==0.0.34
+Requires-Dist: npu-vllm==0.4.2
+Provides-Extra: tensorizer
+Requires-Dist: tensorizer==2.9.0; extra == "tensorizer"
+Dynamic: author
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
+    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
+  </picture>
+</p>
+
+<h3 align="center">
+Easy, fast, and cheap LLM serving for everyone
+</h3>
+
+<p align="center">
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
+
+</p>
+
+*Latest News* 🔥
+- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
+- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
+- [2024/01] Added ROCm 6.0 support to vLLM.
+- [2023/12] Added ROCm 5.7 support to vLLM.
+- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
+- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
+- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
+- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
+- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
+- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
+- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
+
+---
+## About
+vLLM is a fast and easy-to-use library for LLM inference and serving.
+
+vLLM is fast with:
+
+- State-of-the-art serving throughput
+- Efficient management of attention key and value memory with **PagedAttention**
+- Continuous batching of incoming requests
+- Fast model execution with CUDA/HIP graph
+- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
+- Optimized CUDA kernels
+
+vLLM is flexible and easy to use with:
+
+- Seamless integration with popular Hugging Face models
+- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
+- Tensor parallelism support for distributed inference
+- Streaming outputs
+- OpenAI-compatible API server
+- Support for NVIDIA GPUs and AMD GPUs
+- (Experimental) Prefix caching support
+- (Experimental) Multi-LoRA support
+
+vLLM seamlessly supports many Hugging Face models, including the following architectures:
+
+- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
+- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
+- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
+- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
+- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.)
+- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
+- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
+- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
+- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
+- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
+- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
+- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
+- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
+- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
+- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
+- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
+- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
+- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
+- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
+- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
+- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
+- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
+- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
+- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
+- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
+- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
+- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
+- StableLM (`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
+- Starcoder2 (`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
+- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
+- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+
+Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
+
+```bash
+pip install vllm
+```
+
+## Getting Started
+
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
+- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
+- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
+- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
+
+## Contributing
+
+We welcome and value any contributions and collaborations.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+
+## Citation
+
+If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
+```bibtex
+@inproceedings{kwon2023efficient,
+  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
+  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
+  booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
+  year={2023}
+}
+```