vllm-npu 0.4.2 (vllm_npu-0.4.2-py3-none-any.whl)
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/envs.py
ADDED
@@ -0,0 +1,217 @@
import os
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional

if TYPE_CHECKING:
    VLLM_HOST_IP: str = ""
    VLLM_USE_MODELSCOPE: bool = False
    VLLM_INSTANCE_ID: Optional[str] = None
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    VLLM_USE_TRITON_FLASH_ATTN: bool = False
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
    VLLM_API_KEY: Optional[str] = None
    S3_ACCESS_KEY_ID: Optional[str] = None
    S3_SECRET_ACCESS_KEY: Optional[str] = None
    S3_ENDPOINT_URL: Optional[str] = None
    VLLM_CONFIG_ROOT: str = ""
    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
    VLLM_NO_USAGE_STATS: bool = False
    VLLM_DO_NOT_TRACK: bool = False
    VLLM_USAGE_SOURCE: str = ""
    VLLM_CONFIGURE_LOGGING: int = 1
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: int = 0
    VLLM_USE_RAY_COMPILED_DAG: bool = False
    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
    VLLM_TARGET_DEVICE: str = "cuda"
    MAX_JOBS: Optional[str] = None
    NVCC_THREADS: Optional[str] = None
    VLLM_BUILD_WITH_NEURON: bool = False
    VLLM_USE_PRECOMPILED: bool = False
    VLLM_INSTALL_PUNICA_KERNELS: bool = False
    CMAKE_BUILD_TYPE: Optional[str] = None
    VERBOSE: bool = False

# The begin-* and end* here are used by the documentation generator
# to extract the used env vars.

# begin-env-vars-definition

environment_variables: Dict[str, Callable[[], Any]] = {

    # ================== Installation Time Env Vars ==================

    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
    "VLLM_TARGET_DEVICE":
    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),

    # Maximum number of compilation jobs to run in parallel.
    # By default this is the number of CPUs
    "MAX_JOBS":
    lambda: os.getenv("MAX_JOBS", None),

    # Number of threads to use for nvcc
    # By default this is 1.
    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
    "NVCC_THREADS":
    lambda: os.getenv("NVCC_THREADS", None),

    # If set, vllm will build with Neuron support
    "VLLM_BUILD_WITH_NEURON":
    lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)),

    # If set, vllm will use precompiled binaries (*.so)
    "VLLM_USE_PRECOMPILED":
    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),

    # If set, vllm will install Punica kernels
    "VLLM_INSTALL_PUNICA_KERNELS":
    lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))),

    # CMake build type
    # If not set, defaults to "Debug" or "RelWithDebInfo"
    # Available options: "Debug", "Release", "RelWithDebInfo"
    "CMAKE_BUILD_TYPE":
    lambda: os.getenv("CMAKE_BUILD_TYPE"),

    # If set, vllm will print verbose logs during installation
    "VERBOSE":
    lambda: bool(int(os.getenv('VERBOSE', '0'))),

    # Root directory for VLLM configuration files
    # Note that this not only affects how vllm finds its configuration files
    # during runtime, but also affects how vllm installs its configuration
    # files during **installation**.
    "VLLM_CONFIG_ROOT":
    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),

    # ================== Runtime Env Vars ==================

    # used in distributed environment to determine the master address
    'VLLM_HOST_IP':
    lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),

    # If true, will load models from ModelScope instead of Hugging Face Hub.
    # note that the value is true or false, not numbers
    "VLLM_USE_MODELSCOPE":
    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",

    # Instance id represents an instance of the VLLM. All processes in the same
    # instance should have the same instance id.
    "VLLM_INSTANCE_ID":
    lambda: os.environ.get("VLLM_INSTANCE_ID", None),

    # path to cudatoolkit home directory, under which should be bin, include,
    # and lib directories.
    "CUDA_HOME":
    lambda: os.environ.get("CUDA_HOME", None),

    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
    "VLLM_NCCL_SO_PATH":
    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),

    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
    # library file in the locations specified by `LD_LIBRARY_PATH`
    "LD_LIBRARY_PATH":
    lambda: os.environ.get("LD_LIBRARY_PATH", None),

    # flag to control if vllm should use triton flash attention
    "VLLM_USE_TRITON_FLASH_ATTN":
    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
             ("true", "1")),

    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":
    lambda: int(os.environ.get("LOCAL_RANK", "0")),

    # used to control the visible devices in the distributed setting
    "CUDA_VISIBLE_DEVICES":
    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),

    # timeout for each iteration in the engine
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

    # API key for VLLM API server
    "VLLM_API_KEY":
    lambda: os.environ.get("VLLM_API_KEY", None),

    # S3 access information, used for tensorizer to load model from S3
    "S3_ACCESS_KEY_ID":
    lambda: os.environ.get("S3_ACCESS_KEY", None),
    "S3_SECRET_ACCESS_KEY":
    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
    "S3_ENDPOINT_URL":
    lambda: os.environ.get("S3_ENDPOINT_URL", None),

    # Usage stats collection
    "VLLM_USAGE_STATS_SERVER":
    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
    "VLLM_NO_USAGE_STATS":
    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
    "VLLM_DO_NOT_TRACK":
    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
        "DO_NOT_TRACK", None) or "0") == "1",
    "VLLM_USAGE_SOURCE":
    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),

    # Logging configuration
    # If set to 0, vllm will not configure logging
    # If set to 1, vllm will configure logging using the default configuration
    # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
    "VLLM_CONFIGURE_LOGGING":
    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
    "VLLM_LOGGING_CONFIG_PATH":
    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),

    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
    "VLLM_TRACE_FUNCTION":
    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

    # Backend for attention computation
    # Available options:
    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
    # - "FLASH_ATTN": use FlashAttention
    # - "XFORMERS": use XFormers
    # - "ROCM_FLASH": use ROCmFlashAttention
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

    # CPU key-value cache space
    # default is 4GB
    "VLLM_CPU_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),

    # If the env var is set, it uses the Ray's compiled DAG API
    # which optimizes the control plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
    "VLLM_USE_RAY_COMPILED_DAG":
    lambda: bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)),

    # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
    "VLLM_WORKER_MULTIPROC_METHOD":
    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
}

# end-env-vars-definition


def __getattr__(name):
    # lazy evaluation of environment variables
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    return list(environment_variables.keys())
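For reference, a minimal usage sketch (not part of the wheel) of the lazy lookup provided by the module-level __getattr__ above; each attribute access re-reads the environment through the corresponding lambda:

import os

import vllm.envs as envs

# Illustrative only: values are read lazily, so changes to the process
# environment are picked up on the next attribute access.
os.environ["VLLM_CPU_KVCACHE_SPACE"] = "8"
print(envs.VLLM_CPU_KVCACHE_SPACE)        # 8, parsed by int(os.getenv(...))
print(envs.VLLM_WORKER_MULTIPROC_METHOD)  # "spawn" unless overridden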
vllm/executor/__init__.py
File without changes
vllm/executor/cpu_executor.py
ADDED
@@ -0,0 +1,152 @@
from typing import List, Set, Tuple

import torch

import vllm.envs as envs
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        make_async)

logger = init_logger(__name__)


class CPUExecutor(ExecutorBase):

    def _init_executor(self) -> None:
        assert self.device_config.device_type == "cpu"
        assert self.lora_config is None, "cpu backend doesn't support LoRA"
        self.model_config = _verify_and_get_model_config(self.model_config)
        self.cache_config = _verify_and_get_cache_config(self.cache_config)
        self.scheduler_config = _verify_and_get_scheduler_config(
            self.scheduler_config)

        # Instantiate the worker and load the model to CPU.
        self._init_worker()

    def _init_worker(self):
        from vllm.worker.cpu_worker import CPUWorker

        assert self.parallel_config.world_size == 1, (
            "CPUExecutor only supports single CPU socket currently.")

        distributed_init_method = get_distributed_init_method(
            get_ip(), get_open_port())
        self.driver_worker = CPUWorker(
            model_config=self.model_config,
            parallel_config=self.parallel_config,
            scheduler_config=self.scheduler_config,
            device_config=self.device_config,
            cache_config=self.cache_config,
            load_config=self.load_config,
            local_rank=0,
            rank=0,
            distributed_init_method=distributed_init_method,
            lora_config=self.lora_config,
            vision_language_config=self.vision_language_config,
            kv_cache_dtype=self.cache_config.cache_dtype,
            is_driver_worker=True,
        )
        self.driver_worker.init_device()
        self.driver_worker.load_model()

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks by invoking the
        underlying worker.
        """
        return self.driver_worker.determine_num_available_blocks()

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache by invoking the underlying worker.
        """
        # NOTE: We log here to avoid multiple logs when number of workers is
        # greater than one. We could log in the engine, but not all executors
        # have GPUs.
        # NOTE: `cpu block` for CPU backend is located on CPU memory but is
        # referred as `gpu block`. Because we want to reuse the existing block
        # management procedure.
        logger.info("# CPU blocks: %d", num_gpu_blocks)
        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def execute_model(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        output = self.driver_worker.execute_model(execute_model_req)
        return output

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.driver_worker.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.driver_worker.remove_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.driver_worker.list_loras()

    def check_health(self) -> None:
        # CPUExecutor will always be healthy as long as
        # it's running.
        return


class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        output = await make_async(self.driver_worker.execute_model
                                  )(execute_model_req=execute_model_req, )
        return output

    async def check_health_async(self) -> None:
        # CPUExecutor will always be healthy as long as
        # it's running.
        return


def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
    if config.dtype == torch.float16:
        logger.warning("float16 is not supported on CPU, casting to bfloat16.")
        config.dtype = torch.bfloat16
    if not config.enforce_eager:
        logger.warning(
            "CUDA graph is not supported on CPU, fallback to the eager "
            "mode.")
        config.enforce_eager = True
    return config


def _verify_and_get_scheduler_config(
        config: SchedulerConfig) -> SchedulerConfig:
    if config.chunked_prefill_enabled:
        logger.warning("Chunked prefill is not supported on CPU, disable it.")
        config.chunked_prefill_enabled = False

    return config


def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
    _GB = 1 << 30
    if config.enable_prefix_caching:
        logger.warning("Prefix caching is not supported on CPU, disable it.")
        config.enable_prefix_caching = False

    kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE

    if kv_cache_space >= 0:
        if kv_cache_space == 0:
            config.cpu_kvcache_space_bytes = 4 * _GB  # type: ignore
            logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
                           "for CPU backend is not set, using 4 by default.")
        else:
            config.cpu_kvcache_space_bytes = kv_cache_space * _GB  # type: ignore
    else:
        raise RuntimeError(
            "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
            f" {kv_cache_space}, expect a positive integer value.")

    return config
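As a quick sanity check (illustrative, not part of the wheel), the size conversion applied by _verify_and_get_cache_config above treats VLLM_CPU_KVCACHE_SPACE as GiB:

# Illustrative only: VLLM_CPU_KVCACHE_SPACE is interpreted in GiB and
# converted to bytes with a 1 << 30 multiplier; a value of 0 falls back
# to the 4 GiB default.
_GB = 1 << 30
kv_cache_space = 8                # hypothetical VLLM_CPU_KVCACHE_SPACE=8
print(kv_cache_space * _GB)       # 8589934592 bytes
print(4 * _GB)                    # 4294967296 bytes (the default)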
vllm/executor/distributed_gpu_executor.py
ADDED
@@ -0,0 +1,115 @@
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Set, Tuple

from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput

logger = init_logger(__name__)


class DistributedGPUExecutor(GPUExecutor):
    """Abstract superclass of multi-GPU executor implementations."""

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks.

        This invokes `determine_num_available_blocks` on each worker and takes
        the min of the results, guaranteeing that the selected cache sizes are
        compatible with all workers.

        Returns:
            - tuple[num_gpu_blocks, num_cpu_blocks]
        """
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        num_blocks = self._run_workers("determine_num_available_blocks", )

        # Since we use a shared centralized controller, we take the minimum
        # number of blocks across all workers to make sure all the memory
        # operators can be applied to all workers.
        num_gpu_blocks = min(b[0] for b in num_blocks)
        num_cpu_blocks = min(b[1] for b in num_blocks)

        return num_gpu_blocks, num_cpu_blocks

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache in all workers.
        """

        # NOTE: We log here to avoid multiple logs when number of workers is
        # greater than one. We could log in the engine, but not all executors
        # have GPUs.
        logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                    num_cpu_blocks)

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        self._run_workers("initialize_cache",
                          num_gpu_blocks=num_gpu_blocks,
                          num_cpu_blocks=num_cpu_blocks)

    def execute_model(self, *args, **kwargs) -> List[SamplerOutput]:
        all_outputs = self._run_workers("execute_model",
                                        driver_args=args,
                                        driver_kwargs=kwargs)

        # Only the driver worker returns the sampling results.
        return all_outputs[0]

    def add_lora(self, lora_request: LoRARequest) -> bool:
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return self._run_workers(
            "add_lora",
            lora_request=lora_request,
        )

    def remove_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self._run_workers(
            "remove_lora",
            lora_id=lora_id,
        )

    def list_loras(self) -> Set[int]:
        return self._run_workers("list_loras")

    @abstractmethod
    def _run_workers(
        self,
        method: str,
        *args,
        driver_args: Optional[Tuple[Any, ...]] = None,
        driver_kwargs: Optional[Dict[str, Any]] = None,
        max_concurrent_workers: Optional[int] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers."""
        raise NotImplementedError


class DistributedGPUExecutorAsync(DistributedGPUExecutor, ExecutorAsyncBase):

    @abstractmethod
    async def _run_workers_async(
        self,
        method: str,
        *args,
        driver_args: Optional[Tuple[Any, ...]] = None,
        driver_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers."""
        raise NotImplementedError

    async def execute_model_async(self, *args,
                                  **kwargs) -> List[SamplerOutput]:
        all_outputs = await self._run_workers_async("execute_model",
                                                    driver_args=args,
                                                    driver_kwargs=kwargs)

        # Only the driver worker returns the sampling results.
        return all_outputs[0]
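The block-count reduction in determine_num_available_blocks above can be seen in isolation with hypothetical per-worker results (illustrative, not part of the wheel):

# Illustrative only: each worker reports (num_gpu_blocks, num_cpu_blocks);
# the executor keeps the minimum of each so the chosen cache size fits on
# every worker.
num_blocks = [(1200, 400), (1100, 512), (1150, 450)]
num_gpu_blocks = min(b[0] for b in num_blocks)  # 1100
num_cpu_blocks = min(b[1] for b in num_blocks)  # 400
print(num_gpu_blocks, num_cpu_blocks)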
vllm/executor/executor_base.py
ADDED
@@ -0,0 +1,115 @@
from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, ParallelConfig, SchedulerConfig,
                         SpeculativeConfig, VisionLanguageConfig)
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput


class ExecutorBase(ABC):
    """Base class for all executors.

    An executor is responsible for executing the model on a specific device
    type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor
    that can execute the model on multiple devices.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        vision_language_config: Optional[VisionLanguageConfig],
        speculative_config: Optional[SpeculativeConfig],
    ) -> None:
        self.model_config = model_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.load_config = load_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.vision_language_config = vision_language_config
        self.speculative_config = speculative_config

        self._init_executor()

    @abstractmethod
    def _init_executor(self) -> None:
        pass

    @abstractmethod
    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available blocks for the GPU KV cache and
        swappable CPU KV cache.

        Normally, this should simply delegate to the underlying Worker. Some
        ExecutorBase may require modification of the result, e.g. to ensure the
        selected cache sizes are compatible with all workers.

        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
        are blocks that are "active" on the device and can be appended to.
        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
        appended to.
        """
        raise NotImplementedError

    @abstractmethod
    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Initialize the KV cache with the given size in blocks.
        """
        raise NotImplementedError

    @abstractmethod
    def execute_model(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        """Executes at least one model step on the given sequences."""
        raise NotImplementedError

    @abstractmethod
    def add_lora(self, lora_request: LoRARequest) -> bool:
        raise NotImplementedError

    @abstractmethod
    def remove_lora(self, lora_id: int) -> bool:
        raise NotImplementedError

    @abstractmethod
    def list_loras(self) -> Set[int]:
        raise NotImplementedError

    @abstractmethod
    def check_health(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
        exception."""
        raise NotImplementedError

    def shutdown(self) -> None:
        """Shutdown the executor."""
        return

    def __del__(self):
        self.shutdown()


class ExecutorAsyncBase(ExecutorBase):

    @abstractmethod
    async def execute_model_async(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        """Executes one model step on the given sequences."""
        raise NotImplementedError

    async def check_health_async(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
        exception."""
        self.check_health()
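A minimal sketch of what a new device backend has to provide to satisfy this interface, using a hypothetical NoOpExecutor with stub method bodies (illustrative, not part of the wheel):

from typing import List, Set, Tuple

from vllm.executor.executor_base import ExecutorBase
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput


class NoOpExecutor(ExecutorBase):
    """Hypothetical backend showing only the required overrides."""

    def _init_executor(self) -> None:
        # A real backend would create its worker and load the model here.
        pass

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        return 0, 0

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        pass

    def execute_model(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        return []

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return False

    def remove_lora(self, lora_id: int) -> bool:
        return False

    def list_loras(self) -> Set[int]:
        return set()

    def check_health(self) -> None:
        return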