vllm-npu 0.4.2__py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/usage/usage_lib.py
@@ -0,0 +1,209 @@
+ import datetime
+ import json
+ import logging
+ import os
+ import platform
+ import time
+ from enum import Enum
+ from pathlib import Path
+ from threading import Thread
+ from typing import Any, Dict, Optional
+ from uuid import uuid4
+
+ import cpuinfo
+ import psutil
+ import requests
+ import torch
+
+ import vllm.envs as envs
+
+ _config_home = envs.VLLM_CONFIG_ROOT
+ _USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
+ _USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
+                                               "vllm/do_not_track")
+ _USAGE_STATS_ENABLED = None
+ _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
+
+
+ def is_usage_stats_enabled():
+     """Determine whether or not we can send usage stats to the server.
+     The logic is as follows:
+     - By default, it should be enabled.
+     - Three environment variables can disable it:
+         - VLLM_DO_NOT_TRACK=1
+         - DO_NOT_TRACK=1
+         - VLLM_NO_USAGE_STATS=1
+     - A file in the home directory can disable it if it exists:
+         - $HOME/.config/vllm/do_not_track
+     """
+     global _USAGE_STATS_ENABLED
+     if _USAGE_STATS_ENABLED is None:
+         do_not_track = envs.VLLM_DO_NOT_TRACK
+         no_usage_stats = envs.VLLM_NO_USAGE_STATS
+         do_not_track_file = os.path.exists(_USAGE_STATS_DO_NOT_TRACK_PATH)
+
+         _USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats
+                                     or do_not_track_file)
+     return _USAGE_STATS_ENABLED
+
+
+ def _get_current_timestamp_ns() -> int:
+     return int(datetime.datetime.now(datetime.timezone.utc).timestamp() * 1e9)
+
+
+ def _detect_cloud_provider() -> str:
+     # Try detecting through vendor files.
+     vendor_files = [
+         "/sys/class/dmi/id/product_version", "/sys/class/dmi/id/bios_vendor",
+         "/sys/class/dmi/id/product_name",
+         "/sys/class/dmi/id/chassis_asset_tag", "/sys/class/dmi/id/sys_vendor"
+     ]
+     # Mapping of identifiable strings to cloud providers.
+     cloud_identifiers = {
+         "amazon": "AWS",
+         "microsoft corporation": "AZURE",
+         "google": "GCP",
+         "oraclecloud": "OCI",
+     }
+
+     for vendor_file in vendor_files:
+         path = Path(vendor_file)
+         if path.is_file():
+             file_content = path.read_text().lower()
+             for identifier, provider in cloud_identifiers.items():
+                 if identifier in file_content:
+                     return provider
+
+     # Try detecting through environment variables.
+     env_to_cloud_provider = {
+         "RUNPOD_DC_ID": "RUNPOD",
+     }
+     for env_var, provider in env_to_cloud_provider.items():
+         if os.environ.get(env_var):
+             return provider
+
+     return "UNKNOWN"
+
+
+ class UsageContext(str, Enum):
+     UNKNOWN_CONTEXT = "UNKNOWN_CONTEXT"
+     LLM_CLASS = "LLM_CLASS"
+     API_SERVER = "API_SERVER"
+     OPENAI_API_SERVER = "OPENAI_API_SERVER"
+     ENGINE_CONTEXT = "ENGINE_CONTEXT"
+
+
+ class UsageMessage:
+     """Collect platform information and send it to the usage stats server."""
+
+     def __init__(self) -> None:
+         # NOTE: vLLM's server _only_ supports flat KV pairs.
+         # Do not use nested fields.
+
+         self.uuid = str(uuid4())
+
+         # Environment Information
+         self.provider: Optional[str] = None
+         self.num_cpu: Optional[int] = None
+         self.cpu_type: Optional[str] = None
+         self.cpu_family_model_stepping: Optional[str] = None
+         self.total_memory: Optional[int] = None
+         self.architecture: Optional[str] = None
+         self.platform: Optional[str] = None
+         self.gpu_count: Optional[int] = None
+         self.gpu_type: Optional[str] = None
+         self.gpu_memory_per_device: Optional[int] = None
+
+         # vLLM Information
+         self.model_architecture: Optional[str] = None
+         self.vllm_version: Optional[str] = None
+         self.context: Optional[str] = None
+
+         # Metadata
+         self.log_time: Optional[int] = None
+         self.source: Optional[str] = None
+
+     def report_usage(self,
+                      model_architecture: str,
+                      usage_context: UsageContext,
+                      extra_kvs: Optional[Dict[str, Any]] = None) -> None:
+         t = Thread(target=self._report_usage_worker,
+                    args=(model_architecture, usage_context, extra_kvs or {}),
+                    daemon=True)
+         t.start()
+
+     def _report_usage_worker(self, model_architecture: str,
+                              usage_context: UsageContext,
+                              extra_kvs: Dict[str, Any]) -> None:
+         self._report_usage_once(model_architecture, usage_context, extra_kvs)
+         self._report_continuous_usage()
+
+     def _report_usage_once(self, model_architecture: str,
+                            usage_context: UsageContext,
+                            extra_kvs: Dict[str, Any]) -> None:
+         # Platform information
+         if torch.cuda.is_available():
+             device_property = torch.cuda.get_device_properties(0)
+             self.gpu_count = torch.cuda.device_count()
+             self.gpu_type = device_property.name
+             self.gpu_memory_per_device = device_property.total_memory
+         self.provider = _detect_cloud_provider()
+         self.architecture = platform.machine()
+         self.platform = platform.platform()
+         self.total_memory = psutil.virtual_memory().total
+
+         info = cpuinfo.get_cpu_info()
+         self.num_cpu = info.get("count", None)
+         self.cpu_type = info.get("brand_raw", "")
+         self.cpu_family_model_stepping = ",".join([
+             str(info.get("family", "")),
+             str(info.get("model", "")),
+             str(info.get("stepping", ""))
+         ])
+
+         # vLLM information
+         import vllm  # delayed import to prevent circular import
+         self.context = usage_context.value
+         self.vllm_version = vllm.__version__
+         self.model_architecture = model_architecture
+
+         # Metadata
+         self.log_time = _get_current_timestamp_ns()
+         self.source = envs.VLLM_USAGE_SOURCE
+
+         data = vars(self)
+         if extra_kvs:
+             data.update(extra_kvs)
+
+         self._write_to_file(data)
+         self._send_to_server(data)
+
+     def _report_continuous_usage(self):
+         """Report usage every 10 minutes.
+
+         This helps us collect more data points on the uptime of vLLM usage.
+         This function can also help send performance metrics over time.
+         """
+         while True:
+             time.sleep(600)
+             data = {"uuid": self.uuid, "log_time": _get_current_timestamp_ns()}
+
+             self._write_to_file(data)
+             self._send_to_server(data)
+
+     def _send_to_server(self, data):
+         try:
+             requests.post(_USAGE_STATS_SERVER, json=data)
+         except requests.exceptions.RequestException:
+             # Silently ignore unless debug logging is enabled.
+             logging.debug("Failed to send usage data to server")
+
+     def _write_to_file(self, data):
+         os.makedirs(os.path.dirname(_USAGE_STATS_JSON_PATH), exist_ok=True)
+         Path(_USAGE_STATS_JSON_PATH).touch(exist_ok=True)
+         with open(_USAGE_STATS_JSON_PATH, "a") as f:
+             json.dump(data, f)
+             f.write("\n")
+
+
+ usage_message = UsageMessage()
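
For reference, below is a minimal sketch of how the opt-out and reporting flow above can be exercised. It assumes vllm.usage.usage_lib is importable outside the engine; the architecture string is a hypothetical example, not a value the engine itself sends.

import os

# Any opt-out documented in is_usage_stats_enabled() works, provided it is
# in place before the first check (the result is cached in
# _USAGE_STATS_ENABLED). Touching $HOME/.config/vllm/do_not_track also works.
os.environ["VLLM_NO_USAGE_STATS"] = "1"

from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                  usage_message)

print(is_usage_stats_enabled())  # False: disabled by the env var above.

# When enabled, report_usage() spawns a daemon thread and returns
# immediately; the thread sends one full report, then a heartbeat every
# 10 minutes. "LlamaForCausalLM" is a hypothetical example value.
if is_usage_stats_enabled():
    usage_message.report_usage(model_architecture="LlamaForCausalLM",
                               usage_context=UsageContext.LLM_CLASS)

Because the reporting thread is started with daemon=True, the endless heartbeat loop never blocks interpreter shutdown.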