vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/utils.py ADDED
@@ -0,0 +1,677 @@
+ import asyncio
+ import datetime
+ import enum
+ import gc
+ import glob
+ import os
+ import socket
+ import subprocess
+ import tempfile
+ import threading
+ import uuid
+ import warnings
+ from collections import defaultdict
+ from functools import lru_cache, partial
+ from platform import uname
+ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
+                     Hashable, List, Optional, OrderedDict, Tuple, TypeVar,
+                     Union)
+
+ import psutil
+ import torch
+ from packaging.version import Version, parse
+
+ import vllm.envs as envs
+ from vllm.logger import enable_trace_function_call, init_logger
+
+ T = TypeVar("T")
+ logger = init_logger(__name__)
+
+ STR_DTYPE_TO_TORCH_DTYPE = {
+     "half": torch.half,
+     "bfloat16": torch.bfloat16,
+     "float": torch.float,
+     "fp8": torch.uint8,
+ }
+
+
+ class Device(enum.Enum):
+     GPU = enum.auto()
+     CPU = enum.auto()
+
+
+ class Counter:
+
+     def __init__(self, start: int = 0) -> None:
+         self.counter = start
+
+     def __next__(self) -> int:
+         i = self.counter
+         self.counter += 1
+         return i
+
+     def reset(self) -> None:
+         self.counter = 0
+
+
+ class LRUCache(Generic[T]):
+
+     def __init__(self, capacity: int):
+         self.cache: OrderedDict[Hashable, T] = OrderedDict()
+         self.capacity = capacity
+
+     def __contains__(self, key: Hashable) -> bool:
+         return key in self.cache
+
+     def __len__(self) -> int:
+         return len(self.cache)
+
+     def __getitem__(self, key: Hashable) -> Optional[T]:
+         return self.get(key)
+
+     def __setitem__(self, key: Hashable, value: T) -> None:
+         self.put(key, value)
+
+     def __delitem__(self, key: Hashable) -> None:
+         self.pop(key)
+
+     def touch(self, key: Hashable) -> None:
+         self.cache.move_to_end(key)
+
+     def get(self,
+             key: Hashable,
+             default_value: Optional[T] = None) -> Optional[T]:
+         if key in self.cache:
+             value: Optional[T] = self.cache[key]
+             self.cache.move_to_end(key)
+         else:
+             value = default_value
+         return value
+
+     def put(self, key: Hashable, value: T) -> None:
+         self.cache[key] = value
+         self.cache.move_to_end(key)
+         self._remove_old_if_needed()
+
+     def _on_remove(self, key: Hashable, value: Optional[T]):
+         pass
+
+     def remove_oldest(self):
+         if not self.cache:
+             return
+         key, value = self.cache.popitem(last=False)
+         self._on_remove(key, value)
+
+     def _remove_old_if_needed(self) -> None:
+         while len(self.cache) > self.capacity:
+             self.remove_oldest()
+
+     def pop(self,
+             key: Hashable,
+             default_value: Optional[T] = None) -> Optional[T]:
+         run_on_remove = key in self.cache
+         value: Optional[T] = self.cache.pop(key, default_value)
+         if run_on_remove:
+             self._on_remove(key, value)
+         return value
+
+     def clear(self):
+         while len(self.cache) > 0:
+             self.remove_oldest()
+         self.cache.clear()
+
+
+ def is_hip() -> bool:
+     return torch.version.hip is not None
+
+
+ @lru_cache(maxsize=None)
+ def is_cpu() -> bool:
+     from importlib.metadata import PackageNotFoundError, version
+     try:
+         return "cpu" in version("vllm")
+     except PackageNotFoundError:
+         return False
+
+
+ @lru_cache(maxsize=None)
+ def is_neuron() -> bool:
+     try:
+         import transformers_neuronx
+     except ImportError:
+         transformers_neuronx = None
+     return transformers_neuronx is not None
+
+
+ @lru_cache(maxsize=None)
+ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
+     """Returns the maximum shared memory per thread block in bytes."""
+     # NOTE: This import statement should be executed lazily since
+     # the Neuron-X backend does not have the `cuda_utils` module.
+     from vllm._C import cuda_utils
+
+     max_shared_mem = (
+         cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
+     # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
+     # will fail
+     assert max_shared_mem > 0, "max_shared_mem can not be zero"
+     return int(max_shared_mem)
+
+
+ def get_cpu_memory() -> int:
+     """Returns the total CPU memory of the node in bytes."""
+     return psutil.virtual_memory().total
+
+
+ def random_uuid() -> str:
+     return str(uuid.uuid4().hex)
+
+
+ @lru_cache(maxsize=None)
+ def get_vllm_instance_id():
+     """
+     If the environment variable VLLM_INSTANCE_ID is set, return it.
+     Otherwise, return a random UUID.
+     Instance id represents an instance of the VLLM. All processes in the same
+     instance should have the same instance id.
+     """
+     return envs.VLLM_INSTANCE_ID or f"vllm-instance-{random_uuid()}"
+
+
+ @lru_cache(maxsize=None)
+ def in_wsl() -> bool:
+     # Reference: https://github.com/microsoft/WSL/issues/4071
+     return "microsoft" in " ".join(uname()).lower()
+
+
+ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
+     """Take a blocking function, and run it on in an executor thread.
+
+     This function prevents the blocking function from blocking the
+     asyncio event loop.
+     The code in this function needs to be thread safe.
+     """
+
+     def _async_wrapper(*args, **kwargs) -> asyncio.Future:
+         loop = asyncio.get_event_loop()
+         p_func = partial(func, *args, **kwargs)
+         return loop.run_in_executor(executor=None, func=p_func)
+
+     return _async_wrapper
+
+
+ def merge_async_iterators(
+         *iterators: AsyncIterator[T]) -> AsyncIterator[Tuple[int, T]]:
+     """Merge multiple asynchronous iterators into a single iterator.
+
+     This method handle the case where some iterators finish before others.
+     When it yields, it yields a tuple (i, item) where i is the index of the
+     iterator that yields the item.
+     """
+     queue: asyncio.Queue[Union[Tuple[int, T], Exception]] = asyncio.Queue()
+
+     finished = [False] * len(iterators)
+
+     async def producer(i: int, iterator: AsyncIterator[T]):
+         try:
+             async for item in iterator:
+                 await queue.put((i, item))
+         except Exception as e:
+             await queue.put(e)
+         finished[i] = True
+
+     _tasks = [
+         asyncio.create_task(producer(i, iterator))
+         for i, iterator in enumerate(iterators)
+     ]
+
+     async def consumer():
+         try:
+             while not all(finished) or not queue.empty():
+                 item = await queue.get()
+                 if isinstance(item, Exception):
+                     raise item
+                 yield item
+         except (Exception, asyncio.CancelledError) as e:
+             for task in _tasks:
+                 # NOTE: Pass the error msg in cancel()
+                 # when only Python 3.9+ is supported.
+                 task.cancel()
+             raise e
+         await asyncio.gather(*_tasks)
+
+     return consumer()
+
+
+ def get_ip() -> str:
+     host_ip = envs.VLLM_HOST_IP
+     if host_ip:
+         return host_ip
+
+     # IP is not set, try to get it from the network interface
+
+     # try ipv4
+     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+     try:
+         s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
+         return s.getsockname()[0]
+     except Exception:
+         pass
+
+     # try ipv6
+     try:
+         s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+         # Google's public DNS server, see
+         # https://developers.google.com/speed/public-dns/docs/using#addresses
+         s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
+         return s.getsockname()[0]
+     except Exception:
+         pass
+
+     warnings.warn(
+         "Failed to get the IP address, using 0.0.0.0 by default."
+         "The value can be set by the environment variable"
+         " VLLM_HOST_IP or HOST_IP.",
+         stacklevel=2)
+     return "0.0.0.0"
+
+
+ def get_distributed_init_method(ip: str, port: int) -> str:
+     # Brackets are not permitted in ipv4 addresses,
+     # see https://github.com/python/cpython/issues/103848
+     return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}"
+
+
+ def get_open_port() -> int:
+     # try ipv4
+     try:
+         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+             s.bind(("", 0))
+             return s.getsockname()[1]
+     except OSError:
+         # try ipv6
+         with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+             s.bind(("", 0))
+             return s.getsockname()[1]
+
+
+ def update_environment_variables(envs: Dict[str, str]):
+     for k, v in envs.items():
+         if k in os.environ and os.environ[k] != v:
+             logger.warning(
+                 "Overwriting environment variable %s "
+                 "from '%s' to '%s'", k, os.environ[k], v)
+         os.environ[k] = v
+
+
+ def chunk_list(lst, chunk_size):
+     """Yield successive chunk_size chunks from lst."""
+     return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+ def cdiv(a: int, b: int) -> int:
+     """Ceiling division."""
+     return -(a // -b)
+
+
+ @lru_cache(maxsize=None)
+ def get_nvcc_cuda_version() -> Optional[Version]:
+     cuda_home = envs.CUDA_HOME
+     if not cuda_home:
+         cuda_home = '/usr/local/cuda'
+         if os.path.isfile(cuda_home + '/bin/nvcc'):
+             logger.info(
+                 'CUDA_HOME is not found in the environment. '
+                 'Using %s as CUDA_HOME.', cuda_home)
+         else:
+             logger.warning('Not found nvcc in %s. Skip cuda version check!',
+                            cuda_home)
+             return None
+     nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
+                                           universal_newlines=True)
+     output = nvcc_output.split()
+     release_idx = output.index("release") + 1
+     nvcc_cuda_version = parse(output[release_idx].split(",")[0])
+     return nvcc_cuda_version
+
+
+ def _generate_random_fp8(
+     tensor: torch.tensor,
+     low: float,
+     high: float,
+ ) -> None:
+     # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type,
+     # it may occur Inf or NaN if we directly use torch.randint
+     # to generate random data for fp8 data.
+     # For example, s.11111.00 in fp8e5m2 format represents Inf.
+     #     | E4M3       | E5M2
+     # ----|------------|-------------------
+     # Inf | N/A        | s.11111.00
+     # NaN | s.1111.111 | s.11111.{01,10,11}
+     from vllm import _custom_ops as ops
+     tensor_tmp = torch.empty_like(tensor, dtype=torch.float16)
+     tensor_tmp.uniform_(low, high)
+     ops.convert_fp8(tensor_tmp, tensor)
+     del tensor_tmp
+
+
+ def get_kv_cache_torch_dtype(
+         cache_dtype: Optional[Union[str, torch.dtype]],
+         model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype:
+     if isinstance(cache_dtype, str):
+         if cache_dtype == "auto":
+             if isinstance(model_dtype, str):
+                 torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
+             elif isinstance(model_dtype, torch.dtype):
+                 torch_dtype = model_dtype
+             else:
+                 raise ValueError(f"Invalid model dtype: {model_dtype}")
+         elif cache_dtype in ["half", "bfloat16", "float"]:
+             torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
+         elif cache_dtype == "fp8":
+             torch_dtype = torch.uint8
+         else:
+             raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
+     elif isinstance(cache_dtype, torch.dtype):
+         torch_dtype = cache_dtype
+     else:
+         raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
+     return torch_dtype
+
+
+ def create_kv_caches_with_random_flash(
+     num_blocks: int,
+     block_size: int,
+     num_layers: int,
+     num_heads: int,
+     head_size: int,
+     cache_dtype: Optional[Union[str, torch.dtype]],
+     model_dtype: Optional[Union[str, torch.dtype]] = None,
+     seed: int = 0,
+     device: Optional[str] = "cuda",
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+     assert cache_dtype != "fp8"
+     torch.random.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+
+     torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+     key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
+     scale = head_size**-0.5
+     key_caches, value_caches = [], []
+     for _ in range(num_layers):
+         key_value_cache = torch.empty(size=key_value_cache_shape,
+                                       dtype=torch_dtype,
+                                       device=device)
+         key_value_cache.uniform_(-scale, scale)
+         key_caches.append(key_value_cache[:, 0])
+         value_caches.append(key_value_cache[:, 1])
+     return key_caches, value_caches
+
+
+ def create_kv_caches_with_random(
+     num_blocks: int,
+     block_size: int,
+     num_layers: int,
+     num_heads: int,
+     head_size: int,
+     cache_dtype: Optional[Union[str, torch.dtype]],
+     model_dtype: Optional[Union[str, torch.dtype]] = None,
+     seed: int = 0,
+     device: Optional[str] = "cuda",
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+     torch.random.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+
+     torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+
+     scale = head_size**-0.5
+     x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
+     key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
+     key_caches = []
+     for _ in range(num_layers):
+         key_cache = torch.empty(size=key_cache_shape,
+                                 dtype=torch_dtype,
+                                 device=device)
+         if cache_dtype in ["auto", "half", "bfloat16", "float"]:
+             key_cache.uniform_(-scale, scale)
+         elif cache_dtype == 'fp8':
+             _generate_random_fp8(key_cache, -scale, scale)
+         else:
+             raise ValueError(
+                 f"Does not support key cache of type {cache_dtype}")
+         key_caches.append(key_cache)
+
+     value_cache_shape = (num_blocks, num_heads, head_size, block_size)
+     value_caches = []
+     for _ in range(num_layers):
+         value_cache = torch.empty(size=value_cache_shape,
+                                   dtype=torch_dtype,
+                                   device=device)
+         if cache_dtype in ["auto", "half", "bfloat16", "float"]:
+             value_cache.uniform_(-scale, scale)
+         elif cache_dtype == 'fp8':
+             _generate_random_fp8(value_cache, -scale, scale)
+         else:
+             raise ValueError(
+                 f"Does not support value cache of type {cache_dtype}")
+         value_caches.append(value_cache)
+     return key_caches, value_caches
+
+
+ @lru_cache
+ def print_warning_once(msg: str) -> None:
+     logger.warning(msg)
+
+
+ @lru_cache(maxsize=None)
+ def is_pin_memory_available() -> bool:
+
+     if in_wsl():
+         # Pinning memory in WSL is not supported.
+         # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+         print_warning_once("Using 'pin_memory=False' as WSL is detected. "
+                            "This may slow down the performance.")
+         return False
+     elif is_neuron():
+         print_warning_once("Pin memory is not supported on Neuron.")
+         return False
+     elif is_cpu():
+         return False
+     return True
+
+
+ class CudaMemoryProfiler:
+
+     def __init__(self, device=None):
+         self.device = device
+
+     def current_memory_usage(self) -> float:
+         # Return the memory usage in bytes.
+         torch.cuda.reset_peak_memory_stats(self.device)
+         mem = torch.cuda.max_memory_allocated(self.device)
+         return mem
+
+     def __enter__(self):
+         self.initial_memory = self.current_memory_usage()
+         # This allows us to call methods of the context manager if needed
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.final_memory = self.current_memory_usage()
+         self.consumed_memory = self.final_memory - self.initial_memory
+
+         # Force garbage collection
+         gc.collect()
+
+
+ def str_to_int_tuple(s: str) -> Tuple[int, ...]:
+     """Convert a string to a tuple of integers."""
+     try:
+         return tuple(map(int, s.split(",")))
+     except ValueError as e:
+         raise ValueError(
+             "String must be a series of integers separated by commas "
+             f"(e.g., 1, 2, 3). Given input: {s}") from e
+
+
+ def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]:
+     assert len(x) <= max_len
+     return x + [pad] * (max_len - len(x))
+
+
+ def make_tensor_with_pad(
+     x: List[List[int]],
+     max_len: int,
+     pad: int,
+     dtype: torch.dtype,
+     device: Optional[Union[str, torch.device]],
+ ) -> torch.Tensor:
+     """Make a padded tensor of a 2D inputs.
+
+     The padding is applied to the end of each inner list until it reaches
+     `max_len`.
+     """
+     padded_x = [pad_to_max_length(x_i, max_len, pad) for x_i in x]
+     return torch.tensor(padded_x, dtype=dtype, device=device)
+
+
+ def async_tensor_h2d(
+     data: list,
+     dtype: torch.dtype,
+     target_device: Union[str, torch.device],
+     pin_memory: bool,
+ ) -> torch.Tensor:
+     """Asynchronously create a tensor and copy it from host to device."""
+     t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
+     return t.to(device=target_device, non_blocking=True)
+
+
+ def maybe_expand_dim(tensor: torch.Tensor,
+                      target_dims: int,
+                      size: int = 1) -> torch.Tensor:
+     """Expand the tensor to the target_dims."""
+     if tensor.ndim < target_dims:
+         tensor = tensor.view(-1, *([size] * (target_dims - tensor.ndim)))
+     return tensor
+
+
+ def merge_dicts(dict1: Dict[Any, List[Any]],
+                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
+     """Merge 2 dicts that have key -> List of items.
+
+     When a key conflicts, the values in dict1 is prioritized.
+     """
+     merged_dict = defaultdict(list)
+
+     for key, value in dict1.items():
+         merged_dict[key].extend(value)
+
+     for key, value in dict2.items():
+         merged_dict[key].extend(value)
+
+     return dict(merged_dict)
+
+
+ def init_cached_hf_modules():
+     """
+     Lazy initialization of the Hugging Face modules.
+     """
+     from transformers.dynamic_module_utils import init_hf_modules
+     init_hf_modules()
+
+
+ def nccl_integrity_check(filepath):
+     """
+     when the library is corrupted, we cannot catch
+     the exception in python. it will crash the process.
+     instead, we use the exit code of `ldd` to check
+     if the library is corrupted. if not, we will return
+     the version of the library.
+     """
+     exit_code = os.system(f"ldd {filepath} 2>&1 > /dev/null")
+     if exit_code != 0:
+         raise RuntimeError(f"Failed to load NCCL library from {filepath} .")
+     import ctypes
+
+     nccl = ctypes.CDLL(filepath)
+     version = ctypes.c_int()
+     nccl.ncclGetVersion.restype = ctypes.c_int
+     nccl.ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]
+     result = nccl.ncclGetVersion(ctypes.byref(version))
+     assert result == 0
+     return version.value
+
+
+ @lru_cache(maxsize=None)
+ def find_library(lib_name: str) -> str:
+     """
+     Find the library file in the system.
+     `lib_name` is full filename, with both prefix and suffix.
+     This function resolves `lib_name` to the full path of the library.
+     """
+     # Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
+     # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
+     # `/sbin/ldconfig` should exist in all Linux systems.
+     # `/sbin/ldconfig` searches the library in the system
+     libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
+     # each line looks like the following:
+     # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
+     locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
+     # `LD_LIBRARY_PATH` searches the library in the user-defined paths
+     env_ld_library_path = envs.LD_LIBRARY_PATH
+     if not locs and env_ld_library_path:
+         locs = [
+             os.path.join(dir, lib_name)
+             for dir in env_ld_library_path.split(":")
+             if os.path.exists(os.path.join(dir, lib_name))
+         ]
+     if not locs:
+         raise ValueError(f"Cannot find {lib_name} in the system.")
+     return locs[0]
+
+
+ def find_nccl_library():
+     so_file = envs.VLLM_NCCL_SO_PATH
+     VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
+
+     # check if we have vllm-managed nccl
+     vllm_nccl_path = None
+     if torch.version.cuda is not None:
+         cuda_major = torch.version.cuda.split(".")[0]
+         path = os.path.expanduser(
+             f"{VLLM_CONFIG_ROOT}/vllm/nccl/cu{cuda_major}/libnccl.so.*")
+         files = glob.glob(path)
+         vllm_nccl_path = files[0] if files else None
+
+     # manually load the nccl library
+     if so_file:
+         logger.info(
+             "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s",
+             so_file)
+     else:
+         if torch.version.cuda is not None:
+             so_file = vllm_nccl_path or find_library("libnccl.so.2")
+         elif torch.version.hip is not None:
+             so_file = find_library("librccl.so.1")
+         else:
+             raise ValueError("NCCL only supports CUDA and ROCm backends.")
+         logger.info("Found nccl from library %s", so_file)
+     return so_file
+
+
+ def enable_trace_function_call_for_thread() -> None:
+     """Set up function tracing for the current thread,
+     if enabled via the VLLM_TRACE_FUNCTION environment variable
+     """
+
+     if envs.VLLM_TRACE_FUNCTION:
+         tmp_dir = tempfile.gettempdir()
+         filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
+                     f"_thread_{threading.get_ident()}_"
+                     f"at_{datetime.datetime.now()}.log").replace(" ", "_")
+         log_path = os.path.join(tmp_dir, "vllm", get_vllm_instance_id(),
+                                 filename)
+         os.makedirs(os.path.dirname(log_path), exist_ok=True)
+         enable_trace_function_call(log_path)
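
For orientation, here is a minimal usage sketch of a few of the pure-Python helpers added above (Counter, LRUCache, cdiv, make_tensor_with_pad, merge_dicts). It assumes the wheel is installed so these helpers are importable as vllm.utils, runs entirely on CPU, and is illustrative only; it is not part of the wheel contents.

import torch

from vllm.utils import (Counter, LRUCache, cdiv, make_tensor_with_pad,
                        merge_dicts)

# Counter yields monotonically increasing integers, e.g. for request ids.
request_counter = Counter()
first_id = next(request_counter)   # 0
second_id = next(request_counter)  # 1

# LRUCache keeps at most `capacity` entries and evicts the least recently used.
cache: LRUCache[str] = LRUCache(capacity=2)
cache.put("a", "alpha")
cache.put("b", "beta")
cache.put("c", "gamma")  # evicts "a"
assert "a" not in cache and cache.get("b") == "beta"

# cdiv is ceiling division, e.g. blocks needed for 10 tokens with block_size 4.
assert cdiv(10, 4) == 3

# make_tensor_with_pad right-pads ragged token lists into a rectangular tensor.
tokens = [[1, 2, 3], [4]]
padded = make_tensor_with_pad(tokens, max_len=4, pad=0,
                              dtype=torch.long, device="cpu")
# padded == tensor([[1, 2, 3, 0], [4, 0, 0, 0]])

# merge_dicts concatenates the value lists of two key -> list dicts.
assert merge_dicts({"x": [1]}, {"x": [2], "y": [3]}) == {"x": [1, 2], "y": [3]}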
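
Similarly, a small sketch of the async helpers make_async and merge_async_iterators from the file above; the numbers generator is a hypothetical stand-in for a per-request output stream, not something shipped in the wheel.

import asyncio
import time

from vllm.utils import make_async, merge_async_iterators


async def numbers(prefix: str, n: int):
    # Toy async generator standing in for a streaming result source.
    for i in range(n):
        await asyncio.sleep(0.01)
        yield f"{prefix}-{i}"


async def main() -> None:
    # merge_async_iterators interleaves several streams and tags every item
    # with the index of the stream it came from.
    merged = merge_async_iterators(numbers("a", 2), numbers("b", 3))
    async for stream_idx, item in merged:
        print(stream_idx, item)

    # make_async wraps a blocking callable so awaiting it does not stall the
    # event loop (the call runs in the default executor thread pool).
    async_sleep = make_async(time.sleep)
    await async_sleep(0.05)


asyncio.run(main())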