vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/distributed/__init__.py
@@ -0,0 +1,3 @@
+ from .communication_op import *
+ from .parallel_state import *
+ from .utils import *
vllm/distributed/communication_op.py
@@ -0,0 +1,237 @@
+ from collections import namedtuple
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import torch
+ from torch.distributed import ProcessGroup
+
+ from .parallel_state import (get_cpu_world_group,
+                              get_tensor_model_parallel_group,
+                              get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              is_pynccl_enabled_for_all_reduce)
+
+
+ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
+     """All-reduce the input tensor across model parallel group.
+
+     NOTE: This operation will be applied in-place on the input tensor if
+     disable_custom_all_reduce is set to True. Otherwise, this operation may or
+     may not be applied in place depending on whether custom all reduce is
+     invoked for a particular tensor, which further depends on the tensor size
+     and GPU topology.
+
+     TLDR: always assume this function modifies its input, but use the return
+     value as the output.
+     """
+     from vllm.distributed.device_communicators import pynccl_utils
+     from vllm.distributed.device_communicators.custom_all_reduce import (
+         custom_all_reduce)
+
+     # Bypass the function if we are using only 1 GPU.
+     if get_tensor_model_parallel_world_size() == 1:
+         return input_
+     out = custom_all_reduce(input_)
+     if out is not None:
+         return out
+     if is_pynccl_enabled_for_all_reduce():
+         pynccl_utils.all_reduce(input_)
+     else:
+         torch.distributed.all_reduce(input_,
+                                      group=get_tensor_model_parallel_group())
+     return input_
+
+
+ def tensor_model_parallel_all_gather(input_: torch.Tensor,
+                                      dim: int = -1) -> torch.Tensor:
+     """All-gather the input tensor across model parallel group."""
+     world_size = get_tensor_model_parallel_world_size()
+     # Bypass the function if we are using only 1 GPU.
+     if world_size == 1:
+         return input_
+     assert -input_.dim() <= dim < input_.dim(), (
+         f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+     if dim < 0:
+         # Convert negative dim to positive.
+         dim += input_.dim()
+     input_size = input_.size()
+     # Allocate output tensor.
+     output_tensor = torch.empty((world_size, ) + input_size,
+                                 dtype=input_.dtype,
+                                 device=input_.device)
+     # All-gather.
+     torch.distributed.all_gather_into_tensor(
+         output_tensor, input_, group=get_tensor_model_parallel_group())
+     # Reshape
+     output_tensor = output_tensor.movedim(0, dim)
+     output_tensor = output_tensor.reshape(input_size[:dim] +
+                                           (world_size * input_size[dim], ) +
+                                           input_size[dim + 1:])
+     return output_tensor
+
+
+ def tensor_model_parallel_gather(input_: torch.Tensor,
+                                  dst: int = 0,
+                                  dim: int = -1) -> torch.Tensor:
+     """Gather the input tensor across model parallel group.
+
+     NOTE: We assume that the input tensor is on the same device across
+     all the ranks.
+     """
+     world_size = get_tensor_model_parallel_world_size()
+     # Bypass the function if we are using only 1 GPU.
+     if world_size == 1:
+         return input_
+     assert -input_.dim() <= dim < input_.dim(), (
+         f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+     if dim < 0:
+         # Convert negative dim to positive.
+         dim += input_.dim()
+     # Allocate output tensor.
+     if get_tensor_model_parallel_rank() == dst:
+         gather_list = [torch.empty_like(input_) for _ in range(world_size)]
+     else:
+         gather_list = None
+     # Gather.
+     torch.distributed.gather(input_,
+                              gather_list,
+                              dst=dst,
+                              group=get_tensor_model_parallel_group())
+     if get_tensor_model_parallel_rank() == dst:
+         output_tensor = torch.cat(gather_list, dim=dim)
+     else:
+         output_tensor = None
+     return output_tensor
+
+
+ def broadcast(input_: torch.Tensor,
+               src: int = 0,
+               group: Optional[ProcessGroup] = None):
+     """Broadcast the input tensor."""
+     group = group or torch.distributed.group.WORLD
+     ranks = torch.distributed.get_process_group_ranks(group)
+     assert src in ranks, f"Invalid src rank ({src})"
+
+     # Bypass the function if we are using only 1 GPU.
+     world_size = torch.distributed.get_world_size(group=group)
+     if world_size == 1:
+         return input_
+     # Broadcast.
+     torch.distributed.broadcast(input_, src=src, group=group)
+     return input_
+
+
+ def broadcast_object_list(obj_list: List[Any],
+                           src: int = 0,
+                           group: Optional[ProcessGroup] = None):
+     """Broadcast the input object list."""
+     group = group or torch.distributed.group.WORLD
+     ranks = torch.distributed.get_process_group_ranks(group)
+     assert src in ranks, f"Invalid src rank ({src})"
+
+     # Bypass the function if we are using only 1 GPU.
+     world_size = torch.distributed.get_world_size(group=group)
+     if world_size == 1:
+         return obj_list
+     # Broadcast.
+     torch.distributed.broadcast_object_list(obj_list, src=src, group=group)
+     return obj_list
+
+
+ TensorMetadata = namedtuple("TensorMetadata", ["dtype", "size"])
+
+
+ def _split_tensor_dict(
+     tensor_dict: Dict[Any, Union[torch.Tensor, Any]]
+ ) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
+     """Split the tensor dictionary into two parts:
+     1. A list of (key, value) pairs. If the value is a tensor, it is replaced
+        by its metadata.
+     2. A list of tensors.
+     """
+     metadata_list = []
+     tensor_list = []
+     for key, value in tensor_dict.items():
+         if isinstance(value, torch.Tensor):
+             # Note(youkaichao): currently this only supports broadcasting
+             # tensors on cuda. In the future, we can add device as a field in
+             # TensorMetadata to support broadcasting tensors on different
+             # devices.
+             assert value.is_cuda, (
+                 f"Tensor {key}: {value} is not on cuda. Currently we only "
+                 f"support broadcasting tensors on cuda.")
+             metadata_list.append((key, TensorMetadata(value.dtype,
+                                                       value.size())))
+             tensor_list.append(value)
+         else:
+             metadata_list.append((key, value))
+     return metadata_list, tensor_list
+
+
+ def broadcast_tensor_dict(
+     tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
+     src: int = 0,
+     group: Optional[ProcessGroup] = None,
+     metadata_group: Optional[ProcessGroup] = None
+ ) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
+     """Broadcast the input tensor dictionary.
+     `group` is used to broadcast the tensors, while `metadata_group` is used
+     to broadcast the metadata of the dict (e.g. dict structure, tensor sizes,
+     dtypes).
+     """
+     group = group or torch.distributed.group.WORLD
+     metadata_group = metadata_group or get_cpu_world_group()
+     ranks = torch.distributed.get_process_group_ranks(group)
+     assert src in ranks, f"Invalid src rank ({src})"
+
+     # Bypass the function if we are using only 1 GPU.
+     world_size = torch.distributed.get_world_size(group=group)
+     if world_size == 1:
+         return tensor_dict
+
+     rank = torch.distributed.get_rank()
+     if rank == src:
+         metadata_list: List[Tuple[Any, Any]] = []
+         assert isinstance(
+             tensor_dict,
+             dict), (f"Expecting a dictionary, got {type(tensor_dict)}")
+         metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+         # `metadata_list` lives in CPU memory.
+         # `broadcast_object_list` involves serialization and deserialization,
+         # all happening on CPU. Therefore, we can use the CPU group.
+         torch.distributed.broadcast_object_list([metadata_list],
+                                                 src=src,
+                                                 group=metadata_group)
+         async_handles = []
+         for tensor in tensor_list:
+             async_handles.append(
+                 torch.distributed.broadcast(tensor,
+                                             src=src,
+                                             group=group,
+                                             async_op=True))
+         for async_handle in async_handles:
+             async_handle.wait()
+
+     else:
+         recv_metadata_list = [None]
+         torch.distributed.broadcast_object_list(recv_metadata_list,
+                                                 src=src,
+                                                 group=metadata_group)
+         assert recv_metadata_list[0] is not None
+         tensor_dict = {}
+         async_handles = []
+         for key, value in recv_metadata_list[0]:
+             if isinstance(value, TensorMetadata):
+                 tensor = torch.empty(value.size,
+                                      dtype=value.dtype,
+                                      device="cuda")
+                 async_handle = torch.distributed.broadcast(tensor,
+                                                            src=src,
+                                                            async_op=True,
+                                                            group=group)
+                 async_handles.append(async_handle)
+                 tensor_dict[key] = tensor
+             else:
+                 tensor_dict[key] = value
+         for async_handle in async_handles:
+             async_handle.wait()
+     return tensor_dict
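
As orientation for readers of this diff, here is a minimal, hypothetical usage sketch (not part of the wheel) showing how a worker process might call these collectives once vLLM has initialized torch.distributed and the tensor-parallel groups; the payload contents and the _toy_step helper are illustrative assumptions only.

# Hypothetical sketch; assumes torch.distributed and vLLM's tensor-parallel
# groups are already initialized on every rank (normally done by the engine).
import torch

from vllm.distributed.communication_op import (
    broadcast_tensor_dict, tensor_model_parallel_all_reduce)


def _toy_step(rank: int, src: int = 0) -> torch.Tensor:
    if rank == src:
        # The source rank ships dict structure/dtypes over the CPU group and
        # the tensors themselves over the device group.
        payload = {"hidden": torch.ones(4, 8, device="cuda"), "step": 3}
        payload = broadcast_tensor_dict(payload, src=src)
    else:
        # Non-source ranks pass None and receive the reconstructed dict.
        payload = broadcast_tensor_dict(None, src=src)

    # Per the docstring above: always use the return value, since the
    # all-reduce may or may not run in place.
    return tensor_model_parallel_all_reduce(payload["hidden"])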
vllm/distributed/device_communicators/__init__.py (file without changes)
vllm/distributed/device_communicators/custom_all_reduce.py
@@ -0,0 +1,274 @@
+ from contextlib import contextmanager
+ from typing import Any, List, Optional
+
+ import torch
+ import torch.distributed as dist
+
+ import vllm.envs as envs
+ from vllm.logger import init_logger
+
+ try:
+     import pynvml
+
+     from vllm._C import custom_ar
+ except ImportError:
+     # For AMD GPUs
+     custom_ar = None
+     pynvml = None
+
+ logger = init_logger(__name__)
+
+ _CA_HANDLE: Optional["CustomAllreduce"] = None
+ _IS_CAPTURING = False
+ _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
+
+
+ def init_custom_ar() -> None:
+     from vllm.distributed import (get_tensor_model_parallel_rank,
+                                   get_tensor_model_parallel_world_size)
+
+     global _CA_HANDLE
+     if _CA_HANDLE is not None:
+         return
+     rank = get_tensor_model_parallel_rank()
+     world_size = get_tensor_model_parallel_world_size()
+     if world_size == 1:
+         # No need to initialize custom allreduce for single GPU case.
+         return
+
+     if world_size not in _SUPPORTED_WORLD_SIZES:
+         logger.warning(
+             "Custom allreduce is disabled due to an unsupported world size: "
+             "%d. Supported world sizes: %s. To silence this warning, specify"
+             " disable_custom_all_reduce=True explicitly.", world_size,
+             str(_SUPPORTED_WORLD_SIZES))
+         return
+     num_dev = torch.cuda.device_count()
+     # note: num dev can be larger than world_size if we're only using
+     # first few GPUs
+     if num_dev < world_size:
+         logger.warning(
+             "Cannot test GPU P2P because not all GPUs are visible to the "
+             "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
+             " is set.")
+         return
+     # test nvlink first, this will filter out most of the cases
+     # where custom allreduce is not supported
+     cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
+     if cuda_visible_devices:
+         device_ids = list(map(int, cuda_visible_devices.split(",")))
+     else:
+         device_ids = list(range(num_dev))
+     # this checks hardware and driver support for NVLink
+     full_nvlink = _is_full_nvlink(device_ids)
+     if world_size > 2 and not full_nvlink:
+         logger.warning(
+             "Custom allreduce is disabled because it's not supported on more"
+             " than two PCIe-only GPUs. To silence this warning, specify"
+             " disable_custom_all_reduce=True explicitly.")
+         return
+     # test P2P capability, this checks software/cudaruntime support
+     # this is expensive to compute at the first time
+     # then we cache the result
+     if not _can_p2p(rank, world_size):
+         logger.warning(
+             "Custom allreduce is disabled because your platform lacks GPU P2P"
+             " capability or P2P test failed. To silence this warning, specify"
+             " disable_custom_all_reduce=True explicitly.")
+         return
+     _CA_HANDLE = CustomAllreduce(rank, world_size, full_nvlink)
+
+
+ def begin_capture() -> None:
+     global _IS_CAPTURING
+     _IS_CAPTURING = True
+
+
+ def end_capture() -> None:
+     global _IS_CAPTURING
+     _IS_CAPTURING = False
+
+
+ def is_capturing() -> bool:
+     return _IS_CAPTURING and _CA_HANDLE is not None
+
+
+ def get_handle() -> Optional["CustomAllreduce"]:
+     return _CA_HANDLE
+
+
+ def is_initialized() -> bool:
+     return _CA_HANDLE is not None
+
+
+ @contextmanager
+ def capture():
+     try:
+         begin_capture()
+         yield
+     finally:
+         end_capture()
+         handle = get_handle()
+         if handle is not None:
+             handle.register_graph_buffers()
+
+
+ def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
+     ca_handle = get_handle()
+     # when custom allreduce is disabled, this will be None
+     if ca_handle is None:
+         return None
+     if is_capturing():
+         if torch.cuda.is_current_stream_capturing():
+             if ca_handle.should_custom_ar(input):
+                 return ca_handle.all_reduce_reg(input)
+         else:
+             if ca_handle.should_custom_ar(input):
+                 # if warm up, mimic the allocation pattern
+                 # since custom allreduce is out-of-place
+                 return torch.empty_like(input)
+     else:
+         # note: outside of cuda graph context,
+         # custom allreduce incurs a cost of cudaMemcpy, which should
+         # be small(<=1% of overall latency) compared to the performance
+         # gains of using custom kernels
+         if ca_handle.should_custom_ar(input):
+             return ca_handle.all_reduce_unreg(input)
+
+     return None
+
+
+ @contextmanager
+ def _nvml():
+     try:
+         pynvml.nvmlInit()
+         yield
+     finally:
+         pynvml.nvmlShutdown()
+
+
+ @_nvml()
+ def _is_full_nvlink(device_ids: List[int]) -> bool:
+     """
+     query if the set of gpus are fully connected by nvlink (1 hop)
+     Note that `pynvml` is not affected by `CUDA_VISIBLE_DEVICES`,
+     so it works on real physical device ids.
+     """
+     handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
+     for i, handle in enumerate(handles):
+         for j, peer_handle in enumerate(handles):
+             if i < j:
+                 try:
+                     p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                         handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                     if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                         return False
+                 except pynvml.NVMLError as error:
+                     logger.error(
+                         "NVLink detection failed. This is normal if your"
+                         " machine has no NVLink equipped.",
+                         exc_info=error)
+                     return False
+     return True
+
+
+ def _can_p2p(rank: int, world_size: int) -> bool:
+     from vllm.distributed.utils import gpu_p2p_access_check
+     for i in range(world_size):
+         if i == rank:
+             continue
+         if not gpu_p2p_access_check(rank, i):
+             return False
+     return True
+
+
+ class CustomAllreduce:
+
+     # max_size: max supported allreduce size
+     def __init__(self,
+                  rank,
+                  world_size,
+                  full_nvlink,
+                  max_size=8192 * 1024) -> None:
+         # buffers memory are owned by this Python class and passed to C++
+         # meta data composes of two parts: meta data for synchronization
+         # (256 bytes) and a temporary buffer for storing intermediate
+         # allreduce results.
+         self.meta = torch.zeros(custom_ar.meta_size() + max_size,
+                                 dtype=torch.uint8,
+                                 device="cuda")
+         # This is a pre-registered IPC buffer. In eager mode, input tensors
+         # are first copied into this buffer before allreduce is performed
+         self.buffer = torch.empty(max_size, dtype=torch.uint8, device="cuda")
+         # This is a buffer for storing the tuples of pointers pointing to
+         # IPC buffers from all ranks. Each registered tuple has size of
+         # 8*world_size bytes where world_size is at most 8. Allocating 8MB
+         # is enough for 131072 such tuples. The largest model I've seen only
+         # needs less than 10000 of registered tuples.
+         self.rank_data = torch.empty(8 * 1024 * 1024,
+                                      dtype=torch.uint8,
+                                      device="cuda")
+         self.max_size = max_size
+         self.world_size = world_size
+         handles, offsets = self._get_ipc_meta(self.meta)
+         self.full_nvlink = full_nvlink
+         self._ptr = custom_ar.init_custom_ar(self.meta, self.rank_data,
+                                              handles, offsets, rank,
+                                              self.full_nvlink)
+         self.register_buffer(self.buffer)
+
+     def _get_ipc_meta(self, inp: torch.Tensor):
+         data = inp.untyped_storage()._share_cuda_()
+         shard_data = (
+             data[1],  # ipc handle to base ptr
+             data[3],  # offset of base ptr
+         )
+         return self._gather_ipc_meta(shard_data)
+
+     def _gather_ipc_meta(self, shard_data):
+         all_data: List[Optional[Any]] = [None] * self.world_size
+         dist.all_gather_object(all_data, shard_data)
+
+         handles = []
+         offsets = []
+         for i in range(len(all_data)):
+             handles.append(all_data[i][0])  # type: ignore
+             offsets.append(all_data[i][1])  # type: ignore
+         return handles, offsets
+
+     def register_buffer(self, inp: torch.Tensor):
+         handles, offsets = self._get_ipc_meta(inp)
+         custom_ar.register_buffer(self._ptr, inp, handles, offsets)
+
+     def register_graph_buffers(self):
+         handle, offset = custom_ar.get_graph_buffer_ipc_meta(self._ptr)
+         handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
+         logger.info("Registering %d cuda graph addresses", len(offset))
+         custom_ar.register_graph_buffers(self._ptr, handles, offsets)
+
+     def should_custom_ar(self, inp: torch.Tensor):
+         return custom_ar.should_custom_ar(inp, self.max_size, self.world_size,
+                                           self.full_nvlink)
+
+     # all reduce, assuming inp tensor is IPC registered with register_buffer,
+     # or, in the context of cuda graphs, register_graph_buffers
+     def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
+         if out is None:
+             out = torch.empty_like(inp)
+         custom_ar.all_reduce_reg(self._ptr, inp, out)
+         return out
+
+     # all reduce, assuming inp tensor is NOT IPC registered
+     def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None):
+         if out is None:
+             out = torch.empty_like(inp)
+         custom_ar.all_reduce_unreg(self._ptr, inp, self.buffer, out)
+         return out
+
+     def close(self):
+         if self._ptr:
+             custom_ar.dispose(self._ptr)
+             self._ptr = 0
+
+     def __del__(self):
+         self.close()
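
To make the control flow above easier to follow, a hedged sketch of how the custom fast path composes with the regular backend is shown below. In the wheel this sequencing lives in the GPU worker and model runner rather than in user code, and the _allreduce_with_fallback helper is purely illustrative.

# Hypothetical sketch; assumes CUDA devices and an initialized
# torch.distributed / tensor-parallel setup.
import torch

from vllm.distributed.device_communicators import custom_all_reduce as car


def _allreduce_with_fallback(x: torch.Tensor) -> torch.Tensor:
    car.init_custom_ar()  # silently skips on unsupported world size/P2P/NVLink
    out = car.custom_all_reduce(x)  # returns None whenever the fast path declines
    if out is None:
        torch.distributed.all_reduce(x)  # fall back to the default backend
        out = x
    return out

# During CUDA graph capture, the worker wraps the captured region so that the
# graph's IPC buffers are registered when capture ends:
#
#     with car.capture():
#         ...  # capture the forward pass with torch.cuda.graph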