vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/worker/neuron_model_runner.py ADDED
@@ -0,0 +1,196 @@
+ from typing import List, Optional, Tuple
+
+ import torch
+ from torch import nn
+
+ from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
+                          SchedulerConfig)
+ from vllm.logger import init_logger
+ from vllm.model_executor import SamplingMetadata
+ from vllm.model_executor.model_loader.neuron import get_neuron_model
+ from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+ from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+
+ logger = init_logger(__name__)
+
+
+ class NeuronModelRunner:
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         parallel_config: ParallelConfig,
+         scheduler_config: SchedulerConfig,
+         device_config: DeviceConfig,
+     ):
+         self.model_config = model_config
+         self.parallel_config = parallel_config
+         self.scheduler_config = scheduler_config
+
+         if model_config is not None and model_config.get_sliding_window():
+             logger.warning("Sliding window is not supported on Neuron. "
+                            "The model will run without sliding window.")
+         self.device_config = (device_config
+                               if device_config is not None else DeviceConfig())
+         self.device = self.device_config.device
+         self.pin_memory = is_pin_memory_available()
+
+         # Lazy initialization.
+         self.model: nn.Module  # initialize after load_model.
+
+     def load_model(self) -> None:
+         self.model = get_neuron_model(self.model_config,
+                                       parallel_config=self.parallel_config,
+                                       scheduler_config=self.scheduler_config)
+
+     def _prepare_prompt(
+         self,
+         seq_group_metadata_list: List[SequenceGroupMetadata],
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int]]:
+         assert len(seq_group_metadata_list) > 0
+         input_tokens: List[List[int]] = []
+         input_positions: List[List[int]] = []
+         input_block_ids: List[int] = []
+
+         seq_lens: List[int] = []
+         for seq_group_metadata in seq_group_metadata_list:
+             assert seq_group_metadata.is_prompt
+             seq_ids = list(seq_group_metadata.seq_data.keys())
+             assert len(seq_ids) == 1
+             seq_id = seq_ids[0]
+
+             seq_data = seq_group_metadata.seq_data[seq_id]
+             prompt_tokens = seq_data.get_token_ids()
+             seq_len = len(prompt_tokens)
+             seq_lens.append(seq_len)
+
+             input_tokens.append(prompt_tokens)
+             input_positions.append(list(range(seq_len)))
+
+             assert seq_group_metadata.block_tables is not None
+             block_table = seq_group_metadata.block_tables[seq_id]
+             assert len(block_table) == 1
+             input_block_ids.append(block_table[0])
+
+         max_seq_len = max(seq_lens)
+         assert max_seq_len > 0
+         input_tokens = make_tensor_with_pad(input_tokens,
+                                             max_seq_len,
+                                             pad=0,
+                                             dtype=torch.long,
+                                             device=self.device)
+         input_positions = make_tensor_with_pad(input_positions,
+                                                max_seq_len,
+                                                pad=0,
+                                                dtype=torch.long,
+                                                device=self.device)
+         input_block_ids = torch.tensor(input_block_ids,
+                                        dtype=torch.long,
+                                        device=self.device)
+
+         return input_tokens, input_positions, input_block_ids, seq_lens
+
+     def _prepare_decode(
+         self,
+         seq_group_metadata_list: List[SequenceGroupMetadata],
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         assert len(seq_group_metadata_list) > 0
+         input_tokens: List[List[int]] = []
+         input_positions: List[List[int]] = []
+         input_block_ids: List[int] = []
+         context_lens: List[int] = []
+
+         for seq_group_metadata in seq_group_metadata_list:
+             assert not seq_group_metadata.is_prompt
+
+             seq_ids = list(seq_group_metadata.seq_data.keys())
+
+             for seq_id in seq_ids:
+                 seq_data = seq_group_metadata.seq_data[seq_id]
+                 generation_token = seq_data.get_last_token_id()
+                 input_tokens.append([generation_token])
+
+                 seq_len = seq_data.get_len()
+                 position = seq_len - 1
+                 input_positions.append([position])
+                 context_lens.append(seq_len)
+
+                 assert seq_group_metadata.block_tables is not None
+                 block_table = seq_group_metadata.block_tables[seq_id]
+                 assert len(block_table) == 1
+                 input_block_ids.append(block_table[0])
+
+         input_tokens = make_tensor_with_pad(input_tokens,
+                                             max_len=1,
+                                             pad=0,
+                                             dtype=torch.long,
+                                             device=self.device)
+         input_positions = make_tensor_with_pad(input_positions,
+                                                max_len=1,
+                                                pad=0,
+                                                dtype=torch.long,
+                                                device=self.device)
+         context_lens = torch.tensor(context_lens,
+                                     dtype=torch.int,
+                                     device=self.device)
+         input_block_ids = torch.tensor(input_block_ids,
+                                        dtype=torch.long,
+                                        device=self.device)
+
+         return input_tokens, input_positions, input_block_ids
+
+     def prepare_input_tensors(
+         self,
+         seq_group_metadata_list: List[SequenceGroupMetadata],
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]:
+         # NOTE: We assume that all sequences in the group are all prompts or
+         # all decodes.
+         is_prompt = seq_group_metadata_list[0].is_prompt
+         # Prepare input tensors.
+         if is_prompt:
+             (input_tokens, input_positions, input_block_ids,
+              seq_lens) = self._prepare_prompt(seq_group_metadata_list)
+         else:
+             (input_tokens, input_positions,
+              input_block_ids) = self._prepare_decode(seq_group_metadata_list)
+             seq_lens = []
+         sampling_metadata = SamplingMetadata.prepare(
+             seq_group_metadata_list,
+             seq_lens,
+             # query_lens is not needed if chunked prefill is not
+             # supported. Since neuron worker doesn't support chunked prefill
+             # just use seq_lens instead.
+             seq_lens,
+             self.device,
+             self.pin_memory)
+
+         return (input_tokens, input_positions, input_block_ids,
+                 sampling_metadata)
+
+     @torch.inference_mode()
+     def execute_model(
+         self,
+         seq_group_metadata_list: List[SequenceGroupMetadata],
+     ) -> Optional[SamplerOutput]:
+         (input_tokens, input_positions, input_block_ids, sampling_metadata
+          ) = self.prepare_input_tensors(seq_group_metadata_list)
+
+         hidden_states = self.model(
+             input_ids=input_tokens,
+             positions=input_positions,
+             input_block_ids=input_block_ids,
+         )
+
+         # Compute the logits.
+         logits = self.model.compute_logits(hidden_states, sampling_metadata)
+
+         # Sample the next token.
+         output = self.model.sample(
+             logits=logits,
+             sampling_metadata=sampling_metadata,
+         )
+         return output
+
+     @property
+     def vocab_size(self) -> int:
+         return self.model_config.get_vocab_size()
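
For context on what `_prepare_prompt` and `_prepare_decode` produce, the following standalone sketch reproduces the padding behaviour with plain torch. `pad_to_max` is a hypothetical stand-in for vllm's `make_tensor_with_pad`, not the real helper: the prompt path pads every sequence up to the longest prompt in the batch, while the decode path always yields shape (batch, 1).

# Standalone sketch of the tensor shapes built by NeuronModelRunner.
# `pad_to_max` is a hypothetical stand-in for vllm.utils.make_tensor_with_pad.
from typing import List

import torch


def pad_to_max(seqs: List[List[int]], max_len: int, pad: int = 0) -> torch.Tensor:
    # Right-pad each row with `pad` up to `max_len`, as the prompt path does.
    padded = [row + [pad] * (max_len - len(row)) for row in seqs]
    return torch.tensor(padded, dtype=torch.long)


# Prompt phase: two prompts of different lengths are padded to the longest one.
prompts = [[11, 12, 13, 14], [21, 22]]
max_seq_len = max(len(p) for p in prompts)
input_tokens = pad_to_max(prompts, max_seq_len)            # shape (2, 4)
input_positions = pad_to_max(
    [list(range(len(p))) for p in prompts], max_seq_len)   # shape (2, 4)

# Decode phase: one new token per sequence, so max_len is fixed to 1.
last_tokens = [[15], [23]]
decode_tokens = pad_to_max(last_tokens, 1)                 # shape (2, 1)

print(input_tokens.shape, input_positions.shape, decode_tokens.shape)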
vllm/worker/neuron_worker.py ADDED
@@ -0,0 +1,98 @@
+ """A Neuron worker class."""
+ from typing import List, Tuple
+
+ import torch
+ import torch.distributed
+
+ from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                          ParallelConfig, SchedulerConfig)
+ from vllm.model_executor import set_random_seed
+ from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+ from vllm.worker.neuron_model_runner import NeuronModelRunner
+ from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+
+
+ class NeuronWorker(LoraNotSupportedWorkerBase):
+     """A worker class that executes the model on a group of neuron cores.
+     """
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         parallel_config: ParallelConfig,
+         scheduler_config: SchedulerConfig,
+         device_config: DeviceConfig,
+         cache_config: CacheConfig,
+     ) -> None:
+         self.model_config = model_config
+         self.parallel_config = parallel_config
+         self.scheduler_config = scheduler_config
+         self.device_config = device_config
+         self.cache_config = cache_config
+         if self.model_config.trust_remote_code:
+             # note: lazy import to avoid importing torch before initializing
+             from vllm.utils import init_cached_hf_modules
+             init_cached_hf_modules()
+
+         self.model_runner = NeuronModelRunner(model_config, parallel_config,
+                                               scheduler_config, device_config)
+
+     def init_device(self) -> None:
+         # Set random seed.
+         set_random_seed(self.model_config.seed)
+
+     def load_model(self):
+         self.model_runner.load_model()
+
+     def determine_num_available_blocks(self) -> Tuple[int, int]:
+         """Determine the number of available KV blocks.
+
+         Swapping is not yet supported, so always return num_cpu_blocks=0.
+
+         We configure num_gpu_blocks to be equal to max_num_seqs.
+         """
+         # Set the number of GPU blocks to be the same as the maximum number of
+         # sequences that can be processed in a single batch. This is equivalent
+         # to schedule without PagedAttention.
+         num_gpu_blocks = self.scheduler_config.max_num_seqs
+
+         # Swap not yet supported with Neuron backend.
+         num_cpu_blocks = 0
+
+         return num_gpu_blocks, num_cpu_blocks
+
+     def initialize_cache(self, num_gpu_blocks: int,
+                          num_cpu_blocks: int) -> None:
+         """Initialize the KV cache.
+         """
+
+         # Different values are not tested.
+         assert num_cpu_blocks == 0
+         assert num_gpu_blocks == self.scheduler_config.max_num_seqs
+
+         self.cache_config.num_gpu_blocks = num_gpu_blocks
+         self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+     @torch.inference_mode()
+     def execute_model(
+         self,
+         seq_group_metadata_list: List[SequenceGroupMetadata],
+     ) -> List[SamplerOutput]:
+         num_seq_groups = len(seq_group_metadata_list)
+
+         # If there is no input, we don't need to execute the model.
+         if num_seq_groups == 0:
+             return []
+
+         output = self.model_runner.execute_model(seq_group_metadata_list)
+
+         # Neuron worker only supports single-step output. Wrap the output in a
+         # list to conform to interface.
+         return [output]
+
+     def get_cache_block_size_bytes(self) -> int:
+         """Determine the size in bytes of a cache block.
+
+         This is required for speculative decoding; it is not yet implemented.
+         """
+         raise NotImplementedError
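
A minimal sketch of the block accounting above, assuming a hypothetical `max_num_seqs` of 8: the Neuron backend reserves exactly one logical KV block per schedulable sequence and never swaps, so the pair returned by `determine_num_available_blocks` is exactly what `initialize_cache` later asserts against.

# Standalone sketch of NeuronWorker's block accounting; the value of
# MAX_NUM_SEQS is assumed, not taken from a real scheduler config.
from typing import Tuple

MAX_NUM_SEQS = 8  # assumed scheduler_config.max_num_seqs


def determine_num_available_blocks() -> Tuple[int, int]:
    # One logical KV block per schedulable sequence; swapping is unsupported.
    return MAX_NUM_SEQS, 0


def initialize_cache(num_gpu_blocks: int, num_cpu_blocks: int) -> None:
    # Mirrors the asserts in NeuronWorker.initialize_cache.
    assert num_cpu_blocks == 0
    assert num_gpu_blocks == MAX_NUM_SEQS


gpu_blocks, cpu_blocks = determine_num_available_blocks()
initialize_cache(gpu_blocks, cpu_blocks)
print(gpu_blocks, cpu_blocks)  # 8 0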
vllm/worker/worker.py ADDED
@@ -0,0 +1,345 @@
+ """A GPU worker class."""
+ import gc
+ import os
+ from typing import Any, Dict, List, Optional, Set, Tuple
+
+ import torch
+ import torch.distributed
+
+ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                          ModelConfig, ParallelConfig, SchedulerConfig,
+                          VisionLanguageConfig)
+ from vllm.distributed import (broadcast_tensor_dict,
+                               ensure_model_parallel_initialized,
+                               get_tensor_model_parallel_cpu_group,
+                               init_distributed_environment)
+ from vllm.distributed.device_communicators import pynccl_utils
+ from vllm.distributed.device_communicators.custom_all_reduce import (
+     init_custom_ar)
+ from vllm.lora.request import LoRARequest
+ from vllm.model_executor import set_random_seed
+ from vllm.sequence import ExecuteModelRequest, SamplerOutput
+ from vllm.worker.cache_engine import CacheEngine
+ from vllm.worker.model_runner import ModelRunner
+ from vllm.worker.worker_base import WorkerBase
+
+
+ class Worker(WorkerBase):
+     """A worker class that executes (a partition of) the model on a GPU.
+
+     Each worker is associated with a single GPU. The worker is responsible for
+     maintaining the KV cache and executing the model on the GPU. In case of
+     distributed inference, each worker is assigned a partition of the model.
+     """
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         parallel_config: ParallelConfig,
+         scheduler_config: SchedulerConfig,
+         device_config: DeviceConfig,
+         cache_config: CacheConfig,
+         load_config: LoadConfig,
+         local_rank: int,
+         rank: int,
+         distributed_init_method: str,
+         lora_config: Optional[LoRAConfig] = None,
+         vision_language_config: Optional[VisionLanguageConfig] = None,
+         is_driver_worker: bool = False,
+     ) -> None:
+         self.model_config = model_config
+         self.parallel_config = parallel_config
+         self.scheduler_config = scheduler_config
+         self.device_config = device_config
+         self.cache_config = cache_config
+         self.local_rank = local_rank
+         self.rank = rank
+         self.distributed_init_method = distributed_init_method
+         self.lora_config = lora_config
+         self.load_config = load_config
+         self.is_driver_worker = is_driver_worker
+         if self.is_driver_worker:
+             assert self.rank == 0, "The driver worker must have rank 0."
+
+         if self.model_config.trust_remote_code:
+             # note: lazy import to avoid importing torch before initializing
+             from vllm.utils import init_cached_hf_modules
+             init_cached_hf_modules()
+         self.vision_language_config = vision_language_config
+         if self.vision_language_config:
+             assert not self.lora_config, (
+                 "To be tested: vision language model with LoRA settings.")
+
+         self.model_runner = ModelRunner(
+             model_config,
+             parallel_config,
+             scheduler_config,
+             device_config,
+             load_config=load_config,
+             lora_config=self.lora_config,
+             kv_cache_dtype=self.cache_config.cache_dtype,
+             is_driver_worker=is_driver_worker,
+             vision_language_config=vision_language_config,
+         )
+         # Uninitialized cache engine. Will be initialized by
+         # initialize_cache.
+         self.cache_engine: CacheEngine
+         self.gpu_cache: List[torch.Tensor]
+
+     def init_device(self) -> None:
+         if self.device_config.device.type == "cuda":
+             # torch.distributed.all_reduce does not free the input tensor until
+             # the synchronization point. This causes the memory usage to grow
+             # as the number of all_reduce calls increases. This env var disables
+             # this behavior.
+             # Related issue:
+             # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+             os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+             # This env var set by Ray causes exceptions with graph building.
+             os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+             self.device = torch.device(f"cuda:{self.local_rank}")
+             torch.cuda.set_device(self.device)
+
+             _check_if_gpu_supports_dtype(self.model_config.dtype)
+             torch.cuda.empty_cache()
+             self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+         else:
+             raise RuntimeError(
+                 f"Not support device type: {self.device_config.device}")
+         # Initialize the distributed environment.
+         init_worker_distributed_environment(self.parallel_config, self.rank,
+                                             self.distributed_init_method,
+                                             self.local_rank)
+         # Set random seed.
+         set_random_seed(self.model_config.seed)
+
+     def load_model(self):
+         self.model_runner.load_model()
+
+     @torch.inference_mode()
+     def determine_num_available_blocks(self) -> Tuple[int, int]:
+         """Profiles the peak memory usage of the model to determine how many
+         KV blocks may be allocated without OOMs.
+
+         The engine will first conduct a profiling of the existing memory usage.
+         Then, it calculate the maximum possible number of GPU and CPU blocks
+         that can be allocated with the remaining free memory.
+
+         .. tip::
+             You may limit the usage of GPU memory
+             by adjusting the `gpu_memory_utilization` parameter.
+         """
+         # Profile the memory usage of the model and get the maximum number of
+         # cache blocks that can be allocated with the remaining free memory.
+         torch.cuda.empty_cache()
+
+         # Execute a forward pass with dummy inputs to profile the memory usage
+         # of the model.
+         self.model_runner.profile_run()
+
+         # Calculate the number of blocks that can be allocated with the
+         # profiled peak memory.
+         torch.cuda.synchronize()
+         free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+         # NOTE(woosuk): Here we assume that the other processes using the same
+         # GPU did not change their memory usage during the profiling.
+         peak_memory = self.init_gpu_memory - free_gpu_memory
+         assert peak_memory > 0, (
+             "Error in memory profiling. This happens when the GPU memory was "
+             "not properly cleaned up before initializing the vLLM instance.")
+
+         cache_block_size = self.get_cache_block_size_bytes()
+         num_gpu_blocks = int(
+             (total_gpu_memory * self.cache_config.gpu_memory_utilization -
+              peak_memory) // cache_block_size)
+         num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                              cache_block_size)
+         num_gpu_blocks = max(num_gpu_blocks, 0)
+         num_cpu_blocks = max(num_cpu_blocks, 0)
+         if self.model_runner.lora_manager:
+             self.model_runner.remove_all_loras()
+         gc.collect()
+         torch.cuda.empty_cache()
+         return num_gpu_blocks, num_cpu_blocks
+
+     def initialize_cache(self, num_gpu_blocks: int,
+                          num_cpu_blocks: int) -> None:
+         """Allocate GPU and CPU KV cache with the specified number of blocks.
+
+         This also warms up the model, which may record CUDA graphs.
+         """
+         raise_if_cache_size_invalid(num_gpu_blocks,
+                                     self.cache_config.block_size,
+                                     self.model_config.max_model_len)
+
+         self.cache_config.num_gpu_blocks = num_gpu_blocks
+         self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+         self._init_cache_engine()
+         self._warm_up_model()
+
+     def _init_cache_engine(self):
+         assert self.cache_config.num_gpu_blocks is not None
+         self.cache_engine = CacheEngine(self.cache_config, self.model_config,
+                                         self.parallel_config)
+         self.gpu_cache = self.cache_engine.gpu_cache
+         self.model_runner.set_block_size(self.cache_engine.block_size)
+
+     def _warm_up_model(self) -> None:
+         if not self.model_config.enforce_eager:
+             self.model_runner.capture_model(self.gpu_cache)
+         # Reset the seed to ensure that the random state is not affected by
+         # the model initialization and profiling.
+         set_random_seed(self.model_config.seed)
+
+     def cache_swap(
+         self,
+         blocks_to_swap_in: Dict[int, int],
+         blocks_to_swap_out: Dict[int, int],
+         blocks_to_copy: Dict[int, List[int]],
+     ) -> None:
+         # Issue cache operations.
+         # TODO(woosuk): Profile swapping overhead and optimize if needed.
+         if blocks_to_swap_in:
+             self.cache_engine.swap_in(blocks_to_swap_in)
+         if blocks_to_swap_out:
+             self.cache_engine.swap_out(blocks_to_swap_out)
+         if blocks_to_copy:
+             self.cache_engine.copy(blocks_to_copy)
+
+     @torch.inference_mode()
+     def execute_model(
+         self,
+         execute_model_req: Optional[ExecuteModelRequest] = None
+     ) -> List[SamplerOutput]:
+
+         if execute_model_req is None:
+             seq_group_metadata_list = None
+         else:
+             seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+
+         if self.is_driver_worker:
+             assert seq_group_metadata_list is not None
+             assert execute_model_req is not None
+             num_seq_groups = len(seq_group_metadata_list)
+             blocks_to_swap_in = execute_model_req.blocks_to_swap_in
+             blocks_to_swap_out = execute_model_req.blocks_to_swap_out
+             blocks_to_copy = execute_model_req.blocks_to_copy
+             data: Dict[str, Any] = {
+                 "num_seq_groups": num_seq_groups,
+                 "blocks_to_swap_in": blocks_to_swap_in,
+                 "blocks_to_swap_out": blocks_to_swap_out,
+                 "blocks_to_copy": blocks_to_copy,
+             }
+             broadcast_tensor_dict(data, src=0)
+         else:
+             data = broadcast_tensor_dict(src=0)
+             num_seq_groups = data["num_seq_groups"]
+             blocks_to_swap_in = data["blocks_to_swap_in"]
+             blocks_to_swap_out = data["blocks_to_swap_out"]
+             blocks_to_copy = data["blocks_to_copy"]
+
+         self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
+
+         # If there is no input, we don't need to execute the model.
+         if num_seq_groups == 0:
+             return []
+
+         output = self.model_runner.execute_model(seq_group_metadata_list,
+                                                  self.gpu_cache)
+
+         # Worker only supports single-step execution. Wrap the output in a list
+         # to conform to interface.
+         return [output]
+
+     def add_lora(self, lora_request: LoRARequest) -> bool:
+         return self.model_runner.add_lora(lora_request)
+
+     def remove_lora(self, lora_id: int) -> bool:
+         return self.model_runner.remove_lora(lora_id)
+
+     def list_loras(self) -> Set[int]:
+         return self.model_runner.list_loras()
+
+     @property
+     def max_model_len(self) -> int:
+         return self.model_config.max_model_len
+
+     @property
+     def vocab_size(self) -> int:
+         return self.model_runner.vocab_size
+
+     def get_cache_block_size_bytes(self) -> int:
+         """Get the size of the KV cache block size in bytes.
+         """
+         return CacheEngine.get_cache_block_size(self.cache_config,
+                                                 self.model_config,
+                                                 self.parallel_config)
+
+
+ def init_worker_distributed_environment(
+     parallel_config: ParallelConfig,
+     rank: int,
+     distributed_init_method: Optional[str] = None,
+     local_rank: int = -1,
+ ) -> None:
+     """Initialize the distributed environment."""
+     init_distributed_environment(parallel_config.world_size, rank,
+                                  distributed_init_method, local_rank)
+
+     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                       parallel_config.pipeline_parallel_size)
+
+     if pynccl_utils.is_initialized():
+         pynccl_world_size = pynccl_utils.get_world_size()
+         if pynccl_world_size != parallel_config.world_size:
+             raise RuntimeError(
+                 "pynccl is already initialized but the pynccl world "
+                 "size does not match parallel_config.world_size "
+                 f"({pynccl_world_size} vs. {parallel_config.world_size}).")
+     elif parallel_config.world_size > 1:
+         # NOTE(woosuk): We don't initialize pynccl process group when world size
+         # is 1.
+         # NOTE(kaichao): By default, pynccl is initialized for tp group.
+         pynccl_utils.init_process_group(
+             group=get_tensor_model_parallel_cpu_group())
+
+     # Initialize a custom fast all-reduce implementation.
+     if not parallel_config.disable_custom_all_reduce:
+         init_custom_ar()
+
+     # A small all_reduce for warmup.
+     torch.distributed.all_reduce(torch.zeros(1).cuda())
+     if pynccl_utils.is_initialized():
+         pynccl_utils.all_reduce(torch.zeros(1).cuda())
+
+
+ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
+     # Check if the GPU supports the dtype.
+     if torch_dtype == torch.bfloat16:
+         compute_capability = torch.cuda.get_device_capability()
+         if compute_capability[0] < 8:
+             gpu_name = torch.cuda.get_device_name()
+             raise ValueError(
+                 "Bfloat16 is only supported on GPUs with compute capability "
+                 f"of at least 8.0. Your {gpu_name} GPU has compute capability "
+                 f"{compute_capability[0]}.{compute_capability[1]}. "
+                 "You can use float16 instead by explicitly setting the"
+                 "`dtype` flag in CLI, for example: --dtype=half.")
+
+
+ def raise_if_cache_size_invalid(num_gpu_blocks, block_size,
+                                 max_model_len) -> None:
+     if num_gpu_blocks <= 0:
+         raise ValueError("No available memory for the cache blocks. "
+                          "Try increasing `gpu_memory_utilization` when "
+                          "initializing the engine.")
+     max_seq_len = block_size * num_gpu_blocks
+     if max_model_len > max_seq_len:
+         raise ValueError(
+             f"The model's max seq len ({max_model_len}) "
+             "is larger than the maximum number of tokens that can be "
+             f"stored in KV cache ({max_seq_len}). Try increasing "
+             "`gpu_memory_utilization` or decreasing `max_model_len` when "
+             "initializing the engine.")
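
To make the sizing math in `determine_num_available_blocks` and `raise_if_cache_size_invalid` concrete, here is a worked sketch with made-up numbers (a 40 GiB card, 90% utilization, a 6 GiB profiled peak, 16 tokens per block, and a hypothetical 400 KiB per cache block); none of these values come from a real profile, they only illustrate the arithmetic above.

# Worked sketch of the KV-cache sizing logic in Worker; all numbers below are
# assumed for illustration, not measured.
GIB = 1024 ** 3

total_gpu_memory = 40 * GIB          # assumed device capacity
gpu_memory_utilization = 0.90        # cache_config.gpu_memory_utilization
peak_memory = 6 * GIB                # assumed profiled peak (weights + activations)
swap_space_bytes = 4 * GIB           # cache_config.swap_space_bytes
cache_block_size = 400 * 1024        # assumed get_cache_block_size_bytes()
block_size = 16                      # tokens per block
max_model_len = 4096

num_gpu_blocks = max(
    int((total_gpu_memory * gpu_memory_utilization - peak_memory)
        // cache_block_size), 0)
num_cpu_blocks = max(int(swap_space_bytes // cache_block_size), 0)

# Same check as raise_if_cache_size_invalid: the cache must be able to hold
# at least one sequence of max_model_len tokens.
max_seq_len = block_size * num_gpu_blocks
assert num_gpu_blocks > 0 and max_model_len <= max_seq_len

print(num_gpu_blocks, num_cpu_blocks, max_seq_len)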