sglang 0.3.6.post2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +55 -2
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +4 -3
- sglang/bench_serving.py +13 -0
- sglang/check_env.py +1 -1
- sglang/launch_server.py +3 -2
- sglang/srt/_custom_ops.py +118 -0
- sglang/srt/configs/device_config.py +17 -0
- sglang/srt/configs/load_config.py +84 -0
- sglang/srt/configs/model_config.py +161 -4
- sglang/srt/configs/qwen2vl.py +5 -8
- sglang/srt/constrained/outlines_backend.py +6 -1
- sglang/srt/constrained/outlines_jump_forward.py +8 -1
- sglang/srt/distributed/__init__.py +3 -0
- sglang/srt/distributed/communication_op.py +34 -0
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
- sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
- sglang/srt/distributed/device_communicators/pynccl.py +204 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
- sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
- sglang/srt/distributed/parallel_state.py +1275 -0
- sglang/srt/distributed/utils.py +223 -0
- sglang/srt/hf_transformers_utils.py +37 -1
- sglang/srt/layers/attention/flashinfer_backend.py +13 -15
- sglang/srt/layers/attention/torch_native_backend.py +285 -0
- sglang/srt/layers/fused_moe_patch.py +20 -11
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +17 -3
- sglang/srt/layers/quantization/__init__.py +34 -0
- sglang/srt/layers/vocab_parallel_embedding.py +1 -0
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -11
- sglang/srt/managers/detokenizer_manager.py +7 -4
- sglang/srt/managers/image_processor.py +1 -1
- sglang/srt/managers/io_struct.py +48 -12
- sglang/srt/managers/schedule_batch.py +42 -36
- sglang/srt/managers/schedule_policy.py +7 -4
- sglang/srt/managers/scheduler.py +111 -46
- sglang/srt/managers/session_controller.py +0 -3
- sglang/srt/managers/tokenizer_manager.py +169 -100
- sglang/srt/managers/tp_worker.py +36 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
- sglang/srt/model_executor/cuda_graph_runner.py +16 -7
- sglang/srt/model_executor/forward_batch_info.py +9 -4
- sglang/srt/model_executor/model_runner.py +136 -150
- sglang/srt/model_loader/__init__.py +34 -0
- sglang/srt/model_loader/loader.py +1139 -0
- sglang/srt/model_loader/utils.py +41 -0
- sglang/srt/model_loader/weight_utils.py +640 -0
- sglang/srt/models/baichuan.py +9 -10
- sglang/srt/models/chatglm.py +6 -15
- sglang/srt/models/commandr.py +2 -3
- sglang/srt/models/dbrx.py +2 -3
- sglang/srt/models/deepseek.py +4 -11
- sglang/srt/models/deepseek_v2.py +3 -11
- sglang/srt/models/exaone.py +2 -3
- sglang/srt/models/gemma.py +2 -6
- sglang/srt/models/gemma2.py +3 -14
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/gpt2.py +5 -12
- sglang/srt/models/gpt_bigcode.py +6 -22
- sglang/srt/models/grok.py +14 -51
- sglang/srt/models/internlm2.py +2 -3
- sglang/srt/models/internlm2_reward.py +0 -1
- sglang/srt/models/llama.py +97 -27
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_embedding.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +10 -12
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +4 -7
- sglang/srt/models/minicpm3.py +6 -19
- sglang/srt/models/mixtral.py +12 -5
- sglang/srt/models/mixtral_quant.py +2 -3
- sglang/srt/models/mllama.py +3 -7
- sglang/srt/models/olmo.py +2 -8
- sglang/srt/models/olmo2.py +391 -0
- sglang/srt/models/olmoe.py +3 -5
- sglang/srt/models/phi3_small.py +8 -8
- sglang/srt/models/qwen.py +2 -3
- sglang/srt/models/qwen2.py +10 -9
- sglang/srt/models/qwen2_moe.py +4 -11
- sglang/srt/models/qwen2_vl.py +12 -9
- sglang/srt/models/registry.py +99 -0
- sglang/srt/models/stablelm.py +2 -3
- sglang/srt/models/torch_native_llama.py +6 -12
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +4 -11
- sglang/srt/models/yivl.py +2 -3
- sglang/srt/openai_api/adapter.py +10 -6
- sglang/srt/openai_api/protocol.py +1 -0
- sglang/srt/server.py +303 -204
- sglang/srt/server_args.py +65 -31
- sglang/srt/utils.py +253 -48
- sglang/test/test_utils.py +27 -7
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/METADATA +2 -1
- sglang-0.4.0.dist-info/RECORD +184 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
- sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
- sglang/srt/layers/fused_moe_grok/layer.py +0 -630
- sglang-0.3.6.post2.dist-info/RECORD +0 -164
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0

sglang/srt/distributed/device_communicators/custom_all_reduce.py
@@ -0,0 +1,352 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/custom_all_reduce.py
+import ctypes
+import logging
+import os
+from contextlib import contextmanager
+from functools import wraps
+from typing import Callable, List, Optional, TypeVar, Union
+
+import pynvml
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+from typing_extensions import ParamSpec
+
+from sglang.srt import _custom_ops as ops
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import (
+    gpu_p2p_access_check,
+)
+from sglang.srt.distributed.parallel_state import in_the_same_node_as
+from sglang.srt.utils import cuda_device_count_stateless, is_cuda
+
+try:
+    ops.meta_size()
+    custom_ar = True
+except Exception:
+    # For AMD GPUs and CPUs
+    custom_ar = False
+
+logger = logging.getLogger(__name__)
+
+
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+    @wraps(fn)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+        pynvml.nvmlInit()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+@with_nvml_context
+def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
+    """
+    query if the set of gpus are fully connected by nvlink (1 hop)
+    """
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids]
+    for i, handle in enumerate(handles):
+        for j, peer_handle in enumerate(handles):
+            if i < j:
+                try:
+                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK
+                    )
+                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                        return False
+                except pynvml.NVMLError:
+                    logger.exception(
+                        "NVLink detection failed. This is normal if your"
+                        " machine has no NVLink equipped."
+                    )
+                    return False
+    return True
+
+
+def _can_p2p(rank: int, world_size: int) -> bool:
+    # SGLANG_SKIP_P2P_CHECK can be set to False in sglang
+    SGLANG_SKIP_P2P_CHECK = os.getenv("SGLANG_SKIP_P2P_CHECK", "0") == "1"
+    for i in range(world_size):
+        if i == rank:
+            continue
+        if SGLANG_SKIP_P2P_CHECK:
+            logger.info("Skipping P2P check and trusting the driver's P2P report.")
+            return torch.cuda.can_device_access_peer(rank, i)
+        if not gpu_p2p_access_check(rank, i):
+            return False
+    return True
+
+
+def is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
+class CustomAllreduce:
+
+    _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
+
+    # max_size: max supported allreduce size
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
+        max_size=8192 * 1024,
+    ) -> None:
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True
+
+        if not custom_ar:
+            # disable because of missing custom allreduce library
+            # e.g. in a non-cuda environment
+            return
+
+        self.group = group
+
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "CustomAllreduce should be attached to a non-NCCL group."
+
+        if not all(in_the_same_node_as(group, source_rank=0)):
+            # No need to initialize custom allreduce for multi-node case.
+            logger.warning(
+                "Custom allreduce is disabled because this process group"
+                " spans across nodes."
+            )
+            return
+
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize custom allreduce for single GPU case.
+            return
+
+        if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "Custom allreduce is disabled due to an unsupported world"
+                " size: %d. Supported world sizes: %s. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly.",
+                world_size,
+                str(CustomAllreduce._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        if cuda_visible_devices:
+            device_ids = list(map(int, cuda_visible_devices.split(",")))
+        else:
+            device_ids = list(range(cuda_device_count_stateless()))
+
+        physical_device_id = device_ids[device.index]
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
+        gather_list = [
+            torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
+        ]
+        dist.all_gather(gather_list, tensor, group=self.group)
+        physical_device_ids = [t.item() for t in gather_list]
+
+        # test nvlink first, this will filter out most of the cases
+        # where custom allreduce is not supported
+        # this checks hardware and driver support for NVLink
+        assert is_cuda()
+
+        full_nvlink = is_full_nvlink(physical_device_ids)
+        if world_size > 2 and not full_nvlink:
+            logger.warning(
+                "Custom allreduce is disabled because it's not supported on"
+                " more than two PCIe-only GPUs. To silence this warning, "
+                "specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+        # test P2P capability, this checks software/cudaruntime support
+        # this is expensive to compute at the first time
+        # then we cache the result
+        if not _can_p2p(rank, world_size):
+            logger.warning(
+                "Custom allreduce is disabled because your platform lacks "
+                "GPU P2P capability or P2P test failed. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+
+        self.disabled = False
+        # Buffers memory are owned by this Python class and passed to C++.
+        # Meta data composes of two parts: meta data for synchronization and a
+        # temporary buffer for storing intermediate allreduce results.
+        self.meta_ptrs = self.create_shared_buffer(
+            ops.meta_size() + max_size, group=group
+        )
+        # This is a pre-registered IPC buffer. In eager mode, input tensors
+        # are first copied into this buffer before allreduce is performed
+        self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
+        # This is a buffer for storing the tuples of pointers pointing to
+        # IPC buffers from all ranks. Each registered tuple has size of
+        # 8*world_size bytes where world_size is at most 8. Allocating 8MB
+        # is enough for 131072 such tuples. The largest model I've seen only
+        # needs less than 10000 of registered tuples.
+        self.rank_data = torch.empty(
+            8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+        )
+        self.max_size = max_size
+        self.rank = rank
+        self.world_size = world_size
+        self.full_nvlink = full_nvlink
+        self._ptr = ops.init_custom_ar(
+            self.meta_ptrs, self.rank_data, rank, self.full_nvlink
+        )
+        ops.register_buffer(self._ptr, self.buffer_ptrs)
+
+    @staticmethod
+    def create_shared_buffer(
+        size_in_bytes: int, group: Optional[ProcessGroup] = None
+    ) -> List[int]:
+        """
+        Creates a shared buffer and returns a list of pointers
+        representing the buffer on all processes in the group.
+        """
+        lib = CudaRTLibrary()
+        pointer = lib.cudaMalloc(size_in_bytes)
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        world_size = dist.get_world_size(group=group)
+        rank = dist.get_rank(group=group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=group)
+
+        pointers: List[int] = []
+        for i, h in enumerate(handles):
+            if i == rank:
+                pointers.append(pointer.value)  # type: ignore
+            else:
+                pointers.append(lib.cudaIpcOpenMemHandle(h).value)  # type: ignore
+
+        return pointers
+
+    @staticmethod
+    def free_shared_buffer(
+        pointers: List[int], group: Optional[ProcessGroup] = None
+    ) -> None:
+        rank = dist.get_rank(group=group)
+        lib = CudaRTLibrary()
+        lib.cudaFree(ctypes.c_void_p(pointers[rank]))
+
+    @contextmanager
+    def capture(self):
+        """
+        The main responsibility of this context manager is the
+        `register_graph_buffers` call at the end of the context.
+        It records all the buffer addresses used in the CUDA graph.
+        """
+        try:
+            self._IS_CAPTURING = True
+            yield
+        finally:
+            self._IS_CAPTURING = False
+            if not self.disabled:
+                self.register_graph_buffers()
+
+    def register_graph_buffers(self):
+        handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
+        logger.info("Registering %d cuda graph addresses", len(offset))
+        # We cannot directly use `dist.all_gather_object` here
+        # because it is incompatible with `gloo` backend under inference mode.
+        # see https://github.com/pytorch/pytorch/issues/126032 for details.
+        all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))]
+        all_data[self.rank] = [handle, offset]
+        ranks = sorted(dist.get_process_group_ranks(group=self.group))
+        for i, rank in enumerate(ranks):
+            dist.broadcast_object_list(
+                all_data[i], src=rank, group=self.group, device="cpu"
+            )
+        # Unpack list of tuples to tuple of lists.
+        handles = [d[0] for d in all_data]  # type: ignore
+        offsets = [d[1] for d in all_data]  # type: ignore
+        ops.register_graph_buffers(self._ptr, handles, offsets)
+
+    def should_custom_ar(self, inp: torch.Tensor):
+        if self.disabled:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom allreduce requires input byte size to be multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        if not is_weak_contiguous(inp):
+            return False
+        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
+        # little performance improvement over NCCL.
+        if self.world_size == 2 or self.full_nvlink:
+            return inp_size < self.max_size
+        return False
+
+    def all_reduce(
+        self, inp: torch.Tensor, *, out: torch.Tensor = None, registered: bool = False
+    ):
+        """Performs an out-of-place all reduce.
+
+        If registered is True, this assumes inp's pointer is already
+        IPC-registered. Otherwise, inp is first copied into a pre-registered
+        buffer.
+        """
+        if out is None:
+            out = torch.empty_like(inp)
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
+        else:
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
+        return out
+
+    def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
+        """The main allreduce API that provides support for cuda graph."""
+        # When custom allreduce is disabled, this will be None.
+        if self.disabled or not self.should_custom_ar(input):
+            return None
+        if self._IS_CAPTURING:
+            if torch.cuda.is_current_stream_capturing():
+                return self.all_reduce(input, registered=True)
+            else:
+                # If warm up, mimic the allocation pattern since custom
+                # allreduce is out-of-place.
+                return torch.empty_like(input)
+        else:
+            # Note: outside of cuda graph context, custom allreduce incurs a
+            # cost of cudaMemcpy, which should be small (<=1% of overall
+            # latency) compared to the performance gain of using custom kernels
+            return self.all_reduce(input, registered=False)
+
+    def close(self):
+        if not self.disabled and self._ptr:
+            ops.dispose(self._ptr)
+            self._ptr = 0
+            self.free_shared_buffer(self.meta_ptrs)
+            self.free_shared_buffer(self.buffer_ptrs)
+
+    def __del__(self):
+        self.close()
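
The `CustomAllreduce` added above is wired into sglang's tensor-parallel group inside `sglang.srt.distributed.parallel_state`, but it can also be exercised directly. Below is a minimal usage sketch, not part of the diff, assuming two CUDA GPUs on one node and a `torchrun` launch; the script name and the gloo fallback are illustrative only.

```python
# demo_custom_allreduce.py (hypothetical) -- run with:
#   torchrun --nproc_per_node=2 demo_custom_allreduce.py
import os

import torch
import torch.distributed as dist

from sglang.srt.distributed.device_communicators.custom_all_reduce import (
    CustomAllreduce,
)


def main() -> None:
    rank = int(os.environ["RANK"])  # set by torchrun; equals the local rank here
    torch.cuda.set_device(rank)
    # CustomAllreduce asserts the group is NOT NCCL: the gloo group is only used
    # to exchange IPC handles, while the reduction runs in the custom kernel.
    dist.init_process_group(backend="gloo")
    comm = CustomAllreduce(group=dist.group.WORLD, device=rank)

    x = torch.ones(1024, device="cuda") * (rank + 1)
    out = comm.custom_all_reduce(x)
    if out is None:
        # Disabled (no P2P/NVLink, unsupported world size, ...) or the input
        # fails should_custom_ar: fall back to a regular all-reduce.
        dist.all_reduce(x)
        out = x
    print(f"rank {rank}: reduced value = {out[0].item()}")
    comm.close()


if __name__ == "__main__":
    main()
```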

sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
@@ -0,0 +1,291 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+import ctypes
+import json
+import logging
+import os
+import pickle
+import subprocess
+import sys
+import tempfile
+from functools import lru_cache
+from itertools import product
+from typing import Dict, List, Optional, Sequence
+
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from sglang.srt.utils import cuda_device_count_stateless
+
+logger = logging.getLogger(__name__)
+
+
+def update_environment_variables(envs: Dict[str, str]):
+    for k, v in envs.items():
+        if k in os.environ and os.environ[k] != v:
+            logger.warning(
+                "Overwriting environment variable %s " "from '%s' to '%s'",
+                k,
+                os.environ[k],
+                v,
+            )
+        os.environ[k] = v
+
+
+def producer(
+    batch_src: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: Optional[str] = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for i in batch_src:
+        lib.cudaSetDevice(i)
+        pointer = lib.cudaMalloc(1024)
+        lib.cudaMemset(pointer, 1, 1024)
+        lib.cudaDeviceSynchronize()
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        producer_queue.put(handle)
+        open_success = consumer_queue.get()
+        if open_success:
+            # use two queues to simulate barrier
+            producer_queue.put(0)
+            consumer_queue.get()
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def consumer(
+    batch_tgt: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: Optional[str] = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for j in batch_tgt:
+        lib.cudaSetDevice(j)
+        handle = producer_queue.get()
+        open_success = False
+        try:
+            pointer = lib.cudaIpcOpenMemHandle(handle)  # type: ignore
+            open_success = True
+        except RuntimeError:
+            # cannot error out here, because the producer process
+            # is still waiting for the response.
+            pass
+        consumer_queue.put(open_success)
+        if open_success:
+            # modify the memory
+            lib.cudaMemset(pointer, 2, 1024)
+            lib.cudaDeviceSynchronize()
+            # use two queues to simulate barrier
+            producer_queue.get()
+            consumer_queue.put(0)
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def can_actually_p2p(
+    batch_src: Sequence[int],
+    batch_tgt: Sequence[int],
+) -> Sequence[bool]:
+    """
+    Usually, checking if P2P access is enabled can be done by
+    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
+    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
+    returns `True` even if P2P access is not actually possible.
+    See https://github.com/vllm-project/vllm/issues/2728 and
+    https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
+    Therefore, we have to perform a real P2P access to check if it is actually
+    possible.
+
+    Note on p2p and cuda IPC:
+    Usually, one process uses one GPU:
+    GPU src --> cuda context src --> tensor src --> process src
+
+    We need to combine p2p and cuda IPC, so that:
+    GPU src --> cuda context src --> tensor src --> process src
+                                     |shared|
+    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
+    That is to say, process src creates a tensor in GPU src, passes IPC handle to
+    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
+    tensor in process tgt will be reflected in the tensor in process src, because
+    they are the same memory segment.
+    It is important to note that process tgt accesses the tensor in GPU tgt, not
+    GPU src. That's why we need p2p access.
+
+    The most time-consuming part is the process creation. To avoid creating
+    processes for every pair of GPUs, we use batched testing. We create two
+    processes for testing all pairs of GPUs in batch. The trick is to reset
+    the device after each test (which is not available in PyTorch).
+    """  # noqa
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    # pass the CUDA_VISIBLE_DEVICES to the child process
+    # to make sure they see the same set of GPUs
+
+    # make sure the processes are spawned
+    smp = mp.get_context("spawn")
+    producer_queue = smp.Queue()
+    consumer_queue = smp.Queue()
+    result_queue = smp.Queue()
+    p_src = smp.Process(
+        target=producer,
+        args=(
+            batch_src,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_tgt = smp.Process(
+        target=consumer,
+        args=(
+            batch_tgt,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_src.start()
+    p_tgt.start()
+    p_src.join()
+    p_tgt.join()
+    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
+    result: List[bool] = []
+    for src, tgt in zip(batch_src, batch_tgt):
+        a = result_queue.get()
+        b = result_queue.get()
+        if a != b:
+            logger.warning(
+                "Two processes do not agree on the P2P access"
+                " status on %d -> %d, treat as disabled.",
+                src,
+                tgt,
+            )
+            result.append(False)
+        else:
+            result.append(a)
+    return result
+
+
+# why do we need this cache?
+# we are testing peer-to-peer (p2p) access between GPUs,across processes.
+# if we test it every time, it will be very slow, because we need to create
+# N * N * 2 processes, where N is the world size. This is very slow.
+# to reduce the time, we use a cache file to store the p2p access status.
+# the cache file is generated by the master process if it does not exist.
+# then all the processes can read the cache file to check the p2p access status.
+# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
+# can have different cache files for different CUDA_VISIBLE_DEVICES settings,
+# e.g. used by different vllm engines. The device id in the cache file is a
+# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
+# of visible devices in the vllm engine.
+_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
+
+
+def gpu_p2p_access_check(src: int, tgt: int) -> bool:
+    """Check if GPU src can access GPU tgt."""
+
+    # if the cache variable is already calculated,
+    # read from the cache instead of checking it again
+    global _gpu_p2p_access_cache
+    if _gpu_p2p_access_cache is not None:
+        return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+    is_distributed = dist.is_initialized()
+
+    num_dev = cuda_device_count_stateless()
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    if cuda_visible_devices is None:
+        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+
+    # VLLM_CACHE_ROOT -> SGLANG_CACHE_ROOT
+    # "~/.cache/vllm" -> "~/.cache/sglang"
+    SGLANG_CACHE_ROOT = os.path.expanduser("~/.cache/sglang")
+    path = os.path.join(
+        SGLANG_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    from sglang.srt.distributed.parallel_state import get_world_group
+
+    if (not is_distributed or get_world_group().local_rank == 0) and (
+        not os.path.exists(path)
+    ):
+        # only the local master process (with local_rank == 0) can
+        # enter this block to calculate the cache
+        logger.info("generating GPU P2P access cache in %s", path)
+        cache: Dict[str, bool] = {}
+        ids = list(range(num_dev))
+        # batch of all pairs of GPUs
+        batch_src, batch_tgt = zip(*list(product(ids, ids)))
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in that case we cannot use spawn method in multiprocessing.
+        # However, `can_actually_p2p` requires spawn method.
+        # The fix is, we use `subprocess` to call the function,
+        # where we have `if __name__ == "__main__":` in this file.
+
+        # use a temporary file to store the result
+        # we don't use the output of the subprocess directly,
+        # because the subprocess might produce logging output
+        with tempfile.NamedTemporaryFile() as output_file:
+            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
+            returned = subprocess.run(
+                [sys.executable, __file__], input=input_bytes, capture_output=True
+            )
+            # check if the subprocess is successful
+            try:
+                returned.check_returncode()
+            except Exception as e:
+                # wrap raised exception to provide more information
+                raise RuntimeError(
+                    f"Error happened when batch testing "
+                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
+                    f"{returned.stderr.decode()}"
+                ) from e
+            with open(output_file.name, "rb") as f:
+                result = pickle.load(f)
+        for _i, _j, r in zip(batch_src, batch_tgt, result):
+            cache[f"{_i}->{_j}"] = r
+        with open(path, "w") as f:
+            json.dump(cache, f, indent=4)
+    if is_distributed:
+        get_world_group().barrier()
+    logger.info("reading GPU P2P access cache from %s", path)
+    with open(path) as f:
+        cache = json.load(f)
+    _gpu_p2p_access_cache = cache
+    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+
+__all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    with open(output_file, "wb") as f:
+        f.write(pickle.dumps(result))
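
`gpu_p2p_access_check` is what `_can_p2p` in `custom_all_reduce.py` calls for every peer rank, and its result is cached in `~/.cache/sglang/gpu_p2p_access_cache_for_<CUDA_VISIBLE_DEVICES>.json`. A small sketch of querying it outside the server, not part of the diff (the `print_p2p_matrix` helper is hypothetical):

```python
# Dump the pairwise GPU P2P matrix using the cache built by
# custom_all_reduce_utils. The first call on a machine generates the JSON
# cache by spawning one producer and one consumer process that test all
# GPU pairs in batch; subsequent calls only read the file.
import itertools

from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import (
    gpu_p2p_access_check,
)
from sglang.srt.utils import cuda_device_count_stateless


def print_p2p_matrix() -> None:
    num_dev = cuda_device_count_stateless()
    for src, tgt in itertools.product(range(num_dev), repeat=2):
        ok = gpu_p2p_access_check(src, tgt)
        print(f"GPU {src} -> GPU {tgt}: {'ok' if ok else 'blocked'}")


if __name__ == "__main__":
    print_p2p_matrix()
```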

sglang/srt/distributed/device_communicators/hpu_communicator.py
@@ -0,0 +1,48 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/hpu_communicator.py
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.utils import is_hpu
+
+if is_hpu():
+    import habana_frameworks.torch as htorch  # noqa: F401
+
+
+class HpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not is_hpu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
+        # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
+        # (which is required for tensor parallel HPUGraph inference)
+        htorch.core.mark_step()
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += x.dim()
+        input_size = x.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (world_size,) + input_size, dtype=x.dtype, device=x.device
+        )
+        # All-gather.
+        htorch.core.mark_step()
+        dist.all_gather_into_tensor(output_tensor, x, group=self.group)
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor