vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/envs.py ADDED
@@ -0,0 +1,217 @@
+ import os
+ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
+
+ if TYPE_CHECKING:
+     VLLM_HOST_IP: str = ""
+     VLLM_USE_MODELSCOPE: bool = False
+     VLLM_INSTANCE_ID: Optional[str] = None
+     VLLM_NCCL_SO_PATH: Optional[str] = None
+     LD_LIBRARY_PATH: Optional[str] = None
+     VLLM_USE_TRITON_FLASH_ATTN: bool = False
+     LOCAL_RANK: int = 0
+     CUDA_VISIBLE_DEVICES: Optional[str] = None
+     VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
+     VLLM_API_KEY: Optional[str] = None
+     S3_ACCESS_KEY_ID: Optional[str] = None
+     S3_SECRET_ACCESS_KEY: Optional[str] = None
+     S3_ENDPOINT_URL: Optional[str] = None
+     VLLM_CONFIG_ROOT: str = ""
+     VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
+     VLLM_NO_USAGE_STATS: bool = False
+     VLLM_DO_NOT_TRACK: bool = False
+     VLLM_USAGE_SOURCE: str = ""
+     VLLM_CONFIGURE_LOGGING: int = 1
+     VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
+     VLLM_TRACE_FUNCTION: int = 0
+     VLLM_ATTENTION_BACKEND: Optional[str] = None
+     VLLM_CPU_KVCACHE_SPACE: int = 0
+     VLLM_USE_RAY_COMPILED_DAG: bool = False
+     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+     VLLM_TARGET_DEVICE: str = "cuda"
+     MAX_JOBS: Optional[str] = None
+     NVCC_THREADS: Optional[str] = None
+     VLLM_BUILD_WITH_NEURON: bool = False
+     VLLM_USE_PRECOMPILED: bool = False
+     VLLM_INSTALL_PUNICA_KERNELS: bool = False
+     CMAKE_BUILD_TYPE: Optional[str] = None
+     VERBOSE: bool = False
+
+ # The begin-* and end* here are used by the documentation generator
+ # to extract the used env vars.
+
+ # begin-env-vars-definition
+
+ environment_variables: Dict[str, Callable[[], Any]] = {
+
+     # ================== Installation Time Env Vars ==================
+
+     # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
+     "VLLM_TARGET_DEVICE":
+     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
+
+     # Maximum number of compilation jobs to run in parallel.
+     # By default this is the number of CPUs
+     "MAX_JOBS":
+     lambda: os.getenv("MAX_JOBS", None),
+
+     # Number of threads to use for nvcc
+     # By default this is 1.
+     # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
+     "NVCC_THREADS":
+     lambda: os.getenv("NVCC_THREADS", None),
+
+     # If set, vllm will build with Neuron support
+     "VLLM_BUILD_WITH_NEURON":
+     lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)),
+
+     # If set, vllm will use precompiled binaries (*.so)
+     "VLLM_USE_PRECOMPILED":
+     lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),
+
+     # If set, vllm will install Punica kernels
+     "VLLM_INSTALL_PUNICA_KERNELS":
+     lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))),
+
+     # CMake build type
+     # If not set, defaults to "Debug" or "RelWithDebInfo"
+     # Available options: "Debug", "Release", "RelWithDebInfo"
+     "CMAKE_BUILD_TYPE":
+     lambda: os.getenv("CMAKE_BUILD_TYPE"),
+
+     # If set, vllm will print verbose logs during installation
+     "VERBOSE":
+     lambda: bool(int(os.getenv('VERBOSE', '0'))),
+
+     # Root directory for VLLM configuration files
+     # Note that this not only affects how vllm finds its configuration files
+     # during runtime, but also affects how vllm installs its configuration
+     # files during **installation**.
+     "VLLM_CONFIG_ROOT":
+     lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
+         "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+
+     # ================== Runtime Env Vars ==================
+
+     # used in distributed environment to determine the master address
+     'VLLM_HOST_IP':
+     lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
+
+     # If true, will load models from ModelScope instead of Hugging Face Hub.
+     # note that the value is true or false, not numbers
+     "VLLM_USE_MODELSCOPE":
+     lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",
+
+     # Instance id represents an instance of the VLLM. All processes in the same
+     # instance should have the same instance id.
+     "VLLM_INSTANCE_ID":
+     lambda: os.environ.get("VLLM_INSTANCE_ID", None),
+
+     # path to cudatoolkit home directory, under which should be bin, include,
+     # and lib directories.
+     "CUDA_HOME":
+     lambda: os.environ.get("CUDA_HOME", None),
+
+     # Path to the NCCL library file. It is needed because nccl>=2.19 brought
+     # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
+     "VLLM_NCCL_SO_PATH":
+     lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),
+
+     # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
+     # library file in the locations specified by `LD_LIBRARY_PATH`
+     "LD_LIBRARY_PATH":
+     lambda: os.environ.get("LD_LIBRARY_PATH", None),
+
+     # flag to control if vllm should use triton flash attention
+     "VLLM_USE_TRITON_FLASH_ATTN":
+     lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
+              ("true", "1")),
+
+     # local rank of the process in the distributed setting, used to determine
+     # the GPU device id
+     "LOCAL_RANK":
+     lambda: int(os.environ.get("LOCAL_RANK", "0")),
+
+     # used to control the visible devices in the distributed setting
+     "CUDA_VISIBLE_DEVICES":
+     lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
+
+     # timeout for each iteration in the engine
+     "VLLM_ENGINE_ITERATION_TIMEOUT_S":
+     lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
+
+     # API key for VLLM API server
+     "VLLM_API_KEY":
+     lambda: os.environ.get("VLLM_API_KEY", None),
+
+     # S3 access information, used for tensorizer to load model from S3
+     "S3_ACCESS_KEY_ID":
+     lambda: os.environ.get("S3_ACCESS_KEY", None),
+     "S3_SECRET_ACCESS_KEY":
+     lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
+     "S3_ENDPOINT_URL":
+     lambda: os.environ.get("S3_ENDPOINT_URL", None),
+
+     # Usage stats collection
+     "VLLM_USAGE_STATS_SERVER":
+     lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
+     "VLLM_NO_USAGE_STATS":
+     lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
+     "VLLM_DO_NOT_TRACK":
+     lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
+         "DO_NOT_TRACK", None) or "0") == "1",
+     "VLLM_USAGE_SOURCE":
+     lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),
+
+     # Logging configuration
+     # If set to 0, vllm will not configure logging
+     # If set to 1, vllm will configure logging using the default configuration
+     # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
+     "VLLM_CONFIGURE_LOGGING":
+     lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
+     "VLLM_LOGGING_CONFIG_PATH":
+     lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
+
+     # Trace function calls
+     # If set to 1, vllm will trace function calls
+     # Useful for debugging
+     "VLLM_TRACE_FUNCTION":
+     lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
+
+     # Backend for attention computation
+     # Available options:
+     # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+     # - "FLASH_ATTN": use FlashAttention
+     # - "XFORMERS": use XFormers
+     # - "ROCM_FLASH": use ROCmFlashAttention
+     "VLLM_ATTENTION_BACKEND":
+     lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
+
+     # CPU key-value cache space
+     # default is 4GB
+     "VLLM_CPU_KVCACHE_SPACE":
+     lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
+
+     # If the env var is set, it uses the Ray's compiled DAG API
+     # which optimizes the control plane overhead.
+     # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+     "VLLM_USE_RAY_COMPILED_DAG":
+     lambda: bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)),
+
+     # Use dedicated multiprocess context for workers.
+     # Both spawn and fork work
+     "VLLM_WORKER_MULTIPROC_METHOD":
+     lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+ }
+
+ # end-env-vars-definition
+
+
+ def __getattr__(name):
+     # lazy evaluation of environment variables
+     if name in environment_variables:
+         return environment_variables[name]()
+     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+ def __dir__():
+     return list(environment_variables.keys())
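Note on how the module above is consumed: none of these values are bound at import time. Each attribute access goes through the module-level `__getattr__`, which re-evaluates the registered lambda, so the process environment is re-read lazily. A minimal usage sketch (the two variables read here are just examples from the table above):

    import vllm.envs as envs

    # Each access re-runs the lambda stored in `environment_variables`,
    # so changes to os.environ are picked up on the next read.
    device = envs.VLLM_TARGET_DEVICE                  # "cuda" unless overridden
    timeout_s = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S  # int, defaults to 60
    print(device, timeout_s)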
vllm/executor/cpu_executor.py ADDED
@@ -0,0 +1,152 @@
+ from typing import List, Set, Tuple
+
+ import torch
+
+ import vllm.envs as envs
+ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
+ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+ from vllm.logger import init_logger
+ from vllm.lora.request import LoRARequest
+ from vllm.sequence import ExecuteModelRequest, SamplerOutput
+ from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                         make_async)
+
+ logger = init_logger(__name__)
+
+
+ class CPUExecutor(ExecutorBase):
+
+     def _init_executor(self) -> None:
+         assert self.device_config.device_type == "cpu"
+         assert self.lora_config is None, "cpu backend doesn't support LoRA"
+         self.model_config = _verify_and_get_model_config(self.model_config)
+         self.cache_config = _verify_and_get_cache_config(self.cache_config)
+         self.scheduler_config = _verify_and_get_scheduler_config(
+             self.scheduler_config)
+
+         # Instantiate the worker and load the model to CPU.
+         self._init_worker()
+
+     def _init_worker(self):
+         from vllm.worker.cpu_worker import CPUWorker
+
+         assert self.parallel_config.world_size == 1, (
+             "CPUExecutor only supports single CPU socket currently.")
+
+         distributed_init_method = get_distributed_init_method(
+             get_ip(), get_open_port())
+         self.driver_worker = CPUWorker(
+             model_config=self.model_config,
+             parallel_config=self.parallel_config,
+             scheduler_config=self.scheduler_config,
+             device_config=self.device_config,
+             cache_config=self.cache_config,
+             load_config=self.load_config,
+             local_rank=0,
+             rank=0,
+             distributed_init_method=distributed_init_method,
+             lora_config=self.lora_config,
+             vision_language_config=self.vision_language_config,
+             kv_cache_dtype=self.cache_config.cache_dtype,
+             is_driver_worker=True,
+         )
+         self.driver_worker.init_device()
+         self.driver_worker.load_model()
+
+     def determine_num_available_blocks(self) -> Tuple[int, int]:
+         """Determine the number of available KV blocks by invoking the
+         underlying worker.
+         """
+         return self.driver_worker.determine_num_available_blocks()
+
+     def initialize_cache(self, num_gpu_blocks: int,
+                          num_cpu_blocks: int) -> None:
+         """Initialize the KV cache by invoking the underlying worker.
+         """
+         # NOTE: We log here to avoid multiple logs when number of workers is
+         # greater than one. We could log in the engine, but not all executors
+         # have GPUs.
+         # NOTE: `cpu block` for CPU backend is located on CPU memory but is
+         # referred as `gpu block`. Because we want to reuse the existing block
+         # management procedure.
+         logger.info("# CPU blocks: %d", num_gpu_blocks)
+         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+     def execute_model(
+             self,
+             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+         output = self.driver_worker.execute_model(execute_model_req)
+         return output
+
+     def add_lora(self, lora_request: LoRARequest) -> bool:
+         return self.driver_worker.add_lora(lora_request)
+
+     def remove_lora(self, lora_id: int) -> bool:
+         return self.driver_worker.remove_lora(lora_id)
+
+     def list_loras(self) -> Set[int]:
+         return self.driver_worker.list_loras()
+
+     def check_health(self) -> None:
+         # CPUExecutor will always be healthy as long as
+         # it's running.
+         return
+
+
+ class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
+
+     async def execute_model_async(
+             self,
+             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+         output = await make_async(self.driver_worker.execute_model
+                                   )(execute_model_req=execute_model_req, )
+         return output
+
+     async def check_health_async(self) -> None:
+         # CPUExecutor will always be healthy as long as
+         # it's running.
+         return
+
+
+ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
+     if config.dtype == torch.float16:
+         logger.warning("float16 is not supported on CPU, casting to bfloat16.")
+         config.dtype = torch.bfloat16
+     if not config.enforce_eager:
+         logger.warning(
+             "CUDA graph is not supported on CPU, fallback to the eager "
+             "mode.")
+         config.enforce_eager = True
+     return config
+
+
+ def _verify_and_get_scheduler_config(
+         config: SchedulerConfig) -> SchedulerConfig:
+     if config.chunked_prefill_enabled:
+         logger.warning("Chunked prefill is not supported on CPU, disable it.")
+         config.chunked_prefill_enabled = False
+
+     return config
+
+
+ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
+     _GB = 1 << 30
+     if config.enable_prefix_caching:
+         logger.warning("Prefix caching is not supported on CPU, disable it.")
+         config.enable_prefix_caching = False
+
+     kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+
+     if kv_cache_space >= 0:
+         if kv_cache_space == 0:
+             config.cpu_kvcache_space_bytes = 4 * _GB  # type: ignore
+             logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
+                            "for CPU backend is not set, using 4 by default.")
+         else:
+             config.cpu_kvcache_space_bytes = kv_cache_space * _GB  # type: ignore
+     else:
+         raise RuntimeError(
+             "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
+             f" {kv_cache_space}, expect a positive integer value.")
+
+     return config
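`_verify_and_get_cache_config` above is where `VLLM_CPU_KVCACHE_SPACE` (an integer number of GiB) becomes a byte budget: 0 (i.e. unset) falls back to 4 GiB with a warning, positive values are multiplied by 2**30, and negative values raise. A self-contained sketch of that sizing rule; `kvcache_space_bytes` is an illustrative helper name, not part of the package:

    _GB = 1 << 30

    def kvcache_space_bytes(kv_cache_space_gb: int) -> int:
        # Mirrors _verify_and_get_cache_config: 0 means "unset", use 4 GiB.
        if kv_cache_space_gb < 0:
            raise RuntimeError(
                f"Invalid VLLM_CPU_KVCACHE_SPACE {kv_cache_space_gb}, "
                "expect a positive integer value.")
        if kv_cache_space_gb == 0:
            return 4 * _GB
        return kv_cache_space_gb * _GB

    assert kvcache_space_bytes(0) == 4 * (1 << 30)
    assert kvcache_space_bytes(8) == 8 * (1 << 30)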
vllm/executor/distributed_gpu_executor.py ADDED
@@ -0,0 +1,115 @@
+ from abc import abstractmethod
+ from typing import Any, Dict, List, Optional, Set, Tuple
+
+ from vllm.executor.executor_base import ExecutorAsyncBase
+ from vllm.executor.gpu_executor import GPUExecutor
+ from vllm.logger import init_logger
+ from vllm.lora.request import LoRARequest
+ from vllm.sequence import SamplerOutput
+
+ logger = init_logger(__name__)
+
+
+ class DistributedGPUExecutor(GPUExecutor):
+     """Abstract superclass of multi-GPU executor implementations."""
+
+     def determine_num_available_blocks(self) -> Tuple[int, int]:
+         """Determine the number of available KV blocks.
+
+         This invokes `determine_num_available_blocks` on each worker and takes
+         the min of the results, guaranteeing that the selected cache sizes are
+         compatible with all workers.
+
+         Returns:
+             - tuple[num_gpu_blocks, num_cpu_blocks]
+         """
+         # Get the maximum number of blocks that can be allocated on GPU and CPU.
+         num_blocks = self._run_workers("determine_num_available_blocks", )
+
+         # Since we use a shared centralized controller, we take the minimum
+         # number of blocks across all workers to make sure all the memory
+         # operators can be applied to all workers.
+         num_gpu_blocks = min(b[0] for b in num_blocks)
+         num_cpu_blocks = min(b[1] for b in num_blocks)
+
+         return num_gpu_blocks, num_cpu_blocks
+
+     def initialize_cache(self, num_gpu_blocks: int,
+                          num_cpu_blocks: int) -> None:
+         """Initialize the KV cache in all workers.
+         """
+
+         # NOTE: We log here to avoid multiple logs when number of workers is
+         # greater than one. We could log in the engine, but not all executors
+         # have GPUs.
+         logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                     num_cpu_blocks)
+
+         self.cache_config.num_gpu_blocks = num_gpu_blocks
+         self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+         self._run_workers("initialize_cache",
+                           num_gpu_blocks=num_gpu_blocks,
+                           num_cpu_blocks=num_cpu_blocks)
+
+     def execute_model(self, *args, **kwargs) -> List[SamplerOutput]:
+         all_outputs = self._run_workers("execute_model",
+                                         driver_args=args,
+                                         driver_kwargs=kwargs)
+
+         # Only the driver worker returns the sampling results.
+         return all_outputs[0]
+
+     def add_lora(self, lora_request: LoRARequest) -> bool:
+         assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+         return self._run_workers(
+             "add_lora",
+             lora_request=lora_request,
+         )
+
+     def remove_lora(self, lora_id: int) -> bool:
+         assert lora_id > 0, "lora_id must be greater than 0."
+         return self._run_workers(
+             "remove_lora",
+             lora_id=lora_id,
+         )
+
+     def list_loras(self) -> Set[int]:
+         return self._run_workers("list_loras")
+
+     @abstractmethod
+     def _run_workers(
+         self,
+         method: str,
+         *args,
+         driver_args: Optional[Tuple[Any, ...]] = None,
+         driver_kwargs: Optional[Dict[str, Any]] = None,
+         max_concurrent_workers: Optional[int] = None,
+         **kwargs,
+     ) -> Any:
+         """Runs the given method on all workers."""
+         raise NotImplementedError
+
+
+ class DistributedGPUExecutorAsync(DistributedGPUExecutor, ExecutorAsyncBase):
+
+     @abstractmethod
+     async def _run_workers_async(
+         self,
+         method: str,
+         *args,
+         driver_args: Optional[Tuple[Any, ...]] = None,
+         driver_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs,
+     ) -> Any:
+         """Runs the given method on all workers."""
+         raise NotImplementedError
+
+     async def execute_model_async(self, *args,
+                                   **kwargs) -> List[SamplerOutput]:
+         all_outputs = await self._run_workers_async("execute_model",
+                                                     driver_args=args,
+                                                     driver_kwargs=kwargs)
+
+         # Only the driver worker returns the sampling results.
+         return all_outputs[0]
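The cache-sizing handshake in `determine_num_available_blocks` above is an element-wise minimum over the per-worker results returned by `_run_workers`; a small sketch with hypothetical worker outputs:

    # Hypothetical (num_gpu_blocks, num_cpu_blocks) results from three workers.
    num_blocks = [(1024, 512), (980, 512), (1010, 500)]

    # The shared scheduler may only rely on block counts every worker can hold.
    num_gpu_blocks = min(b[0] for b in num_blocks)  # 980
    num_cpu_blocks = min(b[1] for b in num_blocks)  # 500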
vllm/executor/executor_base.py ADDED
@@ -0,0 +1,115 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Optional, Set, Tuple
+
+ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                          ModelConfig, ParallelConfig, SchedulerConfig,
+                          SpeculativeConfig, VisionLanguageConfig)
+ from vllm.lora.request import LoRARequest
+ from vllm.sequence import ExecuteModelRequest, SamplerOutput
+
+
+ class ExecutorBase(ABC):
+     """Base class for all executors.
+
+     An executor is responsible for executing the model on a specific device
+     type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor
+     that can execute the model on multiple devices.
+     """
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         cache_config: CacheConfig,
+         parallel_config: ParallelConfig,
+         scheduler_config: SchedulerConfig,
+         device_config: DeviceConfig,
+         load_config: LoadConfig,
+         lora_config: Optional[LoRAConfig],
+         vision_language_config: Optional[VisionLanguageConfig],
+         speculative_config: Optional[SpeculativeConfig],
+     ) -> None:
+         self.model_config = model_config
+         self.cache_config = cache_config
+         self.lora_config = lora_config
+         self.load_config = load_config
+         self.parallel_config = parallel_config
+         self.scheduler_config = scheduler_config
+         self.device_config = device_config
+         self.vision_language_config = vision_language_config
+         self.speculative_config = speculative_config
+
+         self._init_executor()
+
+     @abstractmethod
+     def _init_executor(self) -> None:
+         pass
+
+     @abstractmethod
+     def determine_num_available_blocks(self) -> Tuple[int, int]:
+         """Determine the number of available blocks for the GPU KV cache and
+         swappable CPU KV cache.
+
+         Normally, this should simply delegate to the underlying Worker. Some
+         ExecutorBase may require modification of the result, e.g. to ensure the
+         selected cache sizes are compatible with all workers.
+
+         Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+         are blocks that are "active" on the device and can be appended to.
+         num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
+         appended to.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def initialize_cache(self, num_gpu_blocks: int,
+                          num_cpu_blocks: int) -> None:
+         """Initialize the KV cache with the given size in blocks.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def execute_model(
+             self,
+             execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+         """Executes at least one model step on the given sequences."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def add_lora(self, lora_request: LoRARequest) -> bool:
+         raise NotImplementedError
+
+     @abstractmethod
+     def remove_lora(self, lora_id: int) -> bool:
+         raise NotImplementedError
+
+     @abstractmethod
+     def list_loras(self) -> Set[int]:
+         raise NotImplementedError
+
+     @abstractmethod
+     def check_health(self) -> None:
+         """Checks if the executor is healthy. If not, it should raise an
+         exception."""
+         raise NotImplementedError
+
+     def shutdown(self) -> None:
+         """Shutdown the executor."""
+         return
+
+     def __del__(self):
+         self.shutdown()
+
+
+ class ExecutorAsyncBase(ExecutorBase):
+
+     @abstractmethod
+     async def execute_model_async(
+         self,
+         execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+         """Executes one model step on the given sequences."""
+         raise NotImplementedError
+
+     async def check_health_async(self) -> None:
+         """Checks if the executor is healthy. If not, it should raise an
+         exception."""
+         self.check_health()
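The constructor above fixes the initialization order for every backend: the config objects are stored first, then `_init_executor()` runs, which is why concrete executors such as `CPUExecutor` override `_init_executor` rather than `__init__`. A toy, self-contained sketch of that pattern (the `_Toy*` names are illustrative and not part of the package):

    from abc import ABC, abstractmethod

    class _ToyExecutorBase(ABC):

        def __init__(self, model_config) -> None:
            # Same shape as ExecutorBase.__init__: store configs, then hand
            # device-specific setup to the subclass hook.
            self.model_config = model_config
            self._init_executor()

        @abstractmethod
        def _init_executor(self) -> None:
            raise NotImplementedError

    class _ToyCPUExecutor(_ToyExecutorBase):

        def _init_executor(self) -> None:
            # Device-specific setup happens here, as in CPUExecutor above.
            self.ready = True

    executor = _ToyCPUExecutor(model_config="dummy-config")
    assert executor.ready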