sglang 0.3.6.post2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sglang/bench_offline_throughput.py +55 -2
  2. sglang/bench_one_batch.py +7 -6
  3. sglang/bench_one_batch_server.py +4 -3
  4. sglang/bench_serving.py +13 -0
  5. sglang/check_env.py +1 -1
  6. sglang/launch_server.py +3 -2
  7. sglang/srt/_custom_ops.py +118 -0
  8. sglang/srt/configs/device_config.py +17 -0
  9. sglang/srt/configs/load_config.py +84 -0
  10. sglang/srt/configs/model_config.py +161 -4
  11. sglang/srt/configs/qwen2vl.py +5 -8
  12. sglang/srt/constrained/outlines_backend.py +6 -1
  13. sglang/srt/constrained/outlines_jump_forward.py +8 -1
  14. sglang/srt/distributed/__init__.py +3 -0
  15. sglang/srt/distributed/communication_op.py +34 -0
  16. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  17. sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
  18. sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
  19. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
  20. sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
  21. sglang/srt/distributed/device_communicators/pynccl.py +204 -0
  22. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
  23. sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
  24. sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
  25. sglang/srt/distributed/parallel_state.py +1275 -0
  26. sglang/srt/distributed/utils.py +223 -0
  27. sglang/srt/hf_transformers_utils.py +37 -1
  28. sglang/srt/layers/attention/flashinfer_backend.py +13 -15
  29. sglang/srt/layers/attention/torch_native_backend.py +285 -0
  30. sglang/srt/layers/fused_moe_patch.py +20 -11
  31. sglang/srt/layers/linear.py +1 -0
  32. sglang/srt/layers/logits_processor.py +17 -3
  33. sglang/srt/layers/quantization/__init__.py +34 -0
  34. sglang/srt/layers/vocab_parallel_embedding.py +1 -0
  35. sglang/srt/lora/lora.py +1 -1
  36. sglang/srt/managers/data_parallel_controller.py +7 -11
  37. sglang/srt/managers/detokenizer_manager.py +7 -4
  38. sglang/srt/managers/image_processor.py +1 -1
  39. sglang/srt/managers/io_struct.py +48 -12
  40. sglang/srt/managers/schedule_batch.py +42 -36
  41. sglang/srt/managers/schedule_policy.py +7 -4
  42. sglang/srt/managers/scheduler.py +111 -46
  43. sglang/srt/managers/session_controller.py +0 -3
  44. sglang/srt/managers/tokenizer_manager.py +169 -100
  45. sglang/srt/managers/tp_worker.py +36 -3
  46. sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
  47. sglang/srt/model_executor/cuda_graph_runner.py +16 -7
  48. sglang/srt/model_executor/forward_batch_info.py +9 -4
  49. sglang/srt/model_executor/model_runner.py +136 -150
  50. sglang/srt/model_loader/__init__.py +34 -0
  51. sglang/srt/model_loader/loader.py +1139 -0
  52. sglang/srt/model_loader/utils.py +41 -0
  53. sglang/srt/model_loader/weight_utils.py +640 -0
  54. sglang/srt/models/baichuan.py +9 -10
  55. sglang/srt/models/chatglm.py +6 -15
  56. sglang/srt/models/commandr.py +2 -3
  57. sglang/srt/models/dbrx.py +2 -3
  58. sglang/srt/models/deepseek.py +4 -11
  59. sglang/srt/models/deepseek_v2.py +3 -11
  60. sglang/srt/models/exaone.py +2 -3
  61. sglang/srt/models/gemma.py +2 -6
  62. sglang/srt/models/gemma2.py +3 -14
  63. sglang/srt/models/gemma2_reward.py +0 -1
  64. sglang/srt/models/gpt2.py +5 -12
  65. sglang/srt/models/gpt_bigcode.py +6 -22
  66. sglang/srt/models/grok.py +14 -51
  67. sglang/srt/models/internlm2.py +2 -3
  68. sglang/srt/models/internlm2_reward.py +0 -1
  69. sglang/srt/models/llama.py +97 -27
  70. sglang/srt/models/llama_classification.py +1 -2
  71. sglang/srt/models/llama_embedding.py +1 -2
  72. sglang/srt/models/llama_reward.py +2 -3
  73. sglang/srt/models/llava.py +10 -12
  74. sglang/srt/models/llavavid.py +1 -2
  75. sglang/srt/models/minicpm.py +4 -7
  76. sglang/srt/models/minicpm3.py +6 -19
  77. sglang/srt/models/mixtral.py +12 -5
  78. sglang/srt/models/mixtral_quant.py +2 -3
  79. sglang/srt/models/mllama.py +3 -7
  80. sglang/srt/models/olmo.py +2 -8
  81. sglang/srt/models/olmo2.py +391 -0
  82. sglang/srt/models/olmoe.py +3 -5
  83. sglang/srt/models/phi3_small.py +8 -8
  84. sglang/srt/models/qwen.py +2 -3
  85. sglang/srt/models/qwen2.py +10 -9
  86. sglang/srt/models/qwen2_moe.py +4 -11
  87. sglang/srt/models/qwen2_vl.py +12 -9
  88. sglang/srt/models/registry.py +99 -0
  89. sglang/srt/models/stablelm.py +2 -3
  90. sglang/srt/models/torch_native_llama.py +6 -12
  91. sglang/srt/models/xverse.py +2 -4
  92. sglang/srt/models/xverse_moe.py +4 -11
  93. sglang/srt/models/yivl.py +2 -3
  94. sglang/srt/openai_api/adapter.py +10 -6
  95. sglang/srt/openai_api/protocol.py +1 -0
  96. sglang/srt/server.py +303 -204
  97. sglang/srt/server_args.py +65 -31
  98. sglang/srt/utils.py +253 -48
  99. sglang/test/test_utils.py +27 -7
  100. sglang/utils.py +2 -2
  101. sglang/version.py +1 -1
  102. {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/METADATA +2 -1
  103. sglang-0.4.0.dist-info/RECORD +184 -0
  104. sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
  105. sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
  106. sglang/srt/layers/fused_moe_grok/layer.py +0 -630
  107. sglang-0.3.6.post2.dist-info/RECORD +0 -164
  108. {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
  109. {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
  110. {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/sglang/srt/distributed/device_communicators/custom_all_reduce.py
@@ -0,0 +1,352 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/custom_all_reduce.py
+import ctypes
+import logging
+import os
+from contextlib import contextmanager
+from functools import wraps
+from typing import Callable, List, Optional, TypeVar, Union
+
+import pynvml
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+from typing_extensions import ParamSpec
+
+from sglang.srt import _custom_ops as ops
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import (
+    gpu_p2p_access_check,
+)
+from sglang.srt.distributed.parallel_state import in_the_same_node_as
+from sglang.srt.utils import cuda_device_count_stateless, is_cuda
+
+try:
+    ops.meta_size()
+    custom_ar = True
+except Exception:
+    # For AMD GPUs and CPUs
+    custom_ar = False
+
+logger = logging.getLogger(__name__)
+
+
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+    @wraps(fn)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+        pynvml.nvmlInit()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+@with_nvml_context
+def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
+    """
+    query if the set of gpus are fully connected by nvlink (1 hop)
+    """
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids]
+    for i, handle in enumerate(handles):
+        for j, peer_handle in enumerate(handles):
+            if i < j:
+                try:
+                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK
+                    )
+                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                        return False
+                except pynvml.NVMLError:
+                    logger.exception(
+                        "NVLink detection failed. This is normal if your"
+                        " machine has no NVLink equipped."
+                    )
+                    return False
+    return True
+
+
+def _can_p2p(rank: int, world_size: int) -> bool:
+    # SGLANG_SKIP_P2P_CHECK can be set to False in sglang
+    SGLANG_SKIP_P2P_CHECK = os.getenv("SGLANG_SKIP_P2P_CHECK", "0") == "1"
+    for i in range(world_size):
+        if i == rank:
+            continue
+        if SGLANG_SKIP_P2P_CHECK:
+            logger.info("Skipping P2P check and trusting the driver's P2P report.")
+            return torch.cuda.can_device_access_peer(rank, i)
+        if not gpu_p2p_access_check(rank, i):
+            return False
+    return True
+
+
+def is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
+class CustomAllreduce:
+
+    _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
+
+    # max_size: max supported allreduce size
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
+        max_size=8192 * 1024,
+    ) -> None:
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True
+
+        if not custom_ar:
+            # disable because of missing custom allreduce library
+            # e.g. in a non-cuda environment
+            return
+
+        self.group = group
+
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "CustomAllreduce should be attached to a non-NCCL group."
+
+        if not all(in_the_same_node_as(group, source_rank=0)):
+            # No need to initialize custom allreduce for multi-node case.
+            logger.warning(
+                "Custom allreduce is disabled because this process group"
+                " spans across nodes."
+            )
+            return
+
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize custom allreduce for single GPU case.
+            return
+
+        if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "Custom allreduce is disabled due to an unsupported world"
+                " size: %d. Supported world sizes: %s. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly.",
+                world_size,
+                str(CustomAllreduce._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        if cuda_visible_devices:
+            device_ids = list(map(int, cuda_visible_devices.split(",")))
+        else:
+            device_ids = list(range(cuda_device_count_stateless()))
+
+        physical_device_id = device_ids[device.index]
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
+        gather_list = [
+            torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
+        ]
+        dist.all_gather(gather_list, tensor, group=self.group)
+        physical_device_ids = [t.item() for t in gather_list]
+
+        # test nvlink first, this will filter out most of the cases
+        # where custom allreduce is not supported
+        # this checks hardware and driver support for NVLink
+        assert is_cuda()
+
+        full_nvlink = is_full_nvlink(physical_device_ids)
+        if world_size > 2 and not full_nvlink:
+            logger.warning(
+                "Custom allreduce is disabled because it's not supported on"
+                " more than two PCIe-only GPUs. To silence this warning, "
+                "specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+        # test P2P capability, this checks software/cudaruntime support
+        # this is expensive to compute at the first time
+        # then we cache the result
+        if not _can_p2p(rank, world_size):
+            logger.warning(
+                "Custom allreduce is disabled because your platform lacks "
+                "GPU P2P capability or P2P test failed. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+
+        self.disabled = False
+        # Buffers memory are owned by this Python class and passed to C++.
+        # Meta data composes of two parts: meta data for synchronization and a
+        # temporary buffer for storing intermediate allreduce results.
+        self.meta_ptrs = self.create_shared_buffer(
+            ops.meta_size() + max_size, group=group
+        )
+        # This is a pre-registered IPC buffer. In eager mode, input tensors
+        # are first copied into this buffer before allreduce is performed
+        self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
+        # This is a buffer for storing the tuples of pointers pointing to
+        # IPC buffers from all ranks. Each registered tuple has size of
+        # 8*world_size bytes where world_size is at most 8. Allocating 8MB
+        # is enough for 131072 such tuples. The largest model I've seen only
+        # needs less than 10000 of registered tuples.
+        self.rank_data = torch.empty(
+            8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+        )
+        self.max_size = max_size
+        self.rank = rank
+        self.world_size = world_size
+        self.full_nvlink = full_nvlink
+        self._ptr = ops.init_custom_ar(
+            self.meta_ptrs, self.rank_data, rank, self.full_nvlink
+        )
+        ops.register_buffer(self._ptr, self.buffer_ptrs)
+
+    @staticmethod
+    def create_shared_buffer(
+        size_in_bytes: int, group: Optional[ProcessGroup] = None
+    ) -> List[int]:
+        """
+        Creates a shared buffer and returns a list of pointers
+        representing the buffer on all processes in the group.
+        """
+        lib = CudaRTLibrary()
+        pointer = lib.cudaMalloc(size_in_bytes)
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        world_size = dist.get_world_size(group=group)
+        rank = dist.get_rank(group=group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=group)
+
+        pointers: List[int] = []
+        for i, h in enumerate(handles):
+            if i == rank:
+                pointers.append(pointer.value)  # type: ignore
+            else:
+                pointers.append(lib.cudaIpcOpenMemHandle(h).value)  # type: ignore
+
+        return pointers
+
+    @staticmethod
+    def free_shared_buffer(
+        pointers: List[int], group: Optional[ProcessGroup] = None
+    ) -> None:
+        rank = dist.get_rank(group=group)
+        lib = CudaRTLibrary()
+        lib.cudaFree(ctypes.c_void_p(pointers[rank]))
+
+    @contextmanager
+    def capture(self):
+        """
+        The main responsibility of this context manager is the
+        `register_graph_buffers` call at the end of the context.
+        It records all the buffer addresses used in the CUDA graph.
+        """
+        try:
+            self._IS_CAPTURING = True
+            yield
+        finally:
+            self._IS_CAPTURING = False
+            if not self.disabled:
+                self.register_graph_buffers()
+
+    def register_graph_buffers(self):
+        handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
+        logger.info("Registering %d cuda graph addresses", len(offset))
+        # We cannot directly use `dist.all_gather_object` here
+        # because it is incompatible with `gloo` backend under inference mode.
+        # see https://github.com/pytorch/pytorch/issues/126032 for details.
+        all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))]
+        all_data[self.rank] = [handle, offset]
+        ranks = sorted(dist.get_process_group_ranks(group=self.group))
+        for i, rank in enumerate(ranks):
+            dist.broadcast_object_list(
+                all_data[i], src=rank, group=self.group, device="cpu"
+            )
+        # Unpack list of tuples to tuple of lists.
+        handles = [d[0] for d in all_data]  # type: ignore
+        offsets = [d[1] for d in all_data]  # type: ignore
+        ops.register_graph_buffers(self._ptr, handles, offsets)
+
+    def should_custom_ar(self, inp: torch.Tensor):
+        if self.disabled:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom allreduce requires input byte size to be multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        if not is_weak_contiguous(inp):
+            return False
+        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
+        # little performance improvement over NCCL.
+        if self.world_size == 2 or self.full_nvlink:
+            return inp_size < self.max_size
+        return False
+
+    def all_reduce(
+        self, inp: torch.Tensor, *, out: torch.Tensor = None, registered: bool = False
+    ):
+        """Performs an out-of-place all reduce.
+
+        If registered is True, this assumes inp's pointer is already
+        IPC-registered. Otherwise, inp is first copied into a pre-registered
+        buffer.
+        """
+        if out is None:
+            out = torch.empty_like(inp)
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
+        else:
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
+        return out
+
+    def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
+        """The main allreduce API that provides support for cuda graph."""
+        # When custom allreduce is disabled, this will be None.
+        if self.disabled or not self.should_custom_ar(input):
+            return None
+        if self._IS_CAPTURING:
+            if torch.cuda.is_current_stream_capturing():
+                return self.all_reduce(input, registered=True)
+            else:
+                # If warm up, mimic the allocation pattern since custom
+                # allreduce is out-of-place.
+                return torch.empty_like(input)
+        else:
+            # Note: outside of cuda graph context, custom allreduce incurs a
+            # cost of cudaMemcpy, which should be small (<=1% of overall
+            # latency) compared to the performance gain of using custom kernels
+            return self.all_reduce(input, registered=False)
+
+    def close(self):
+        if not self.disabled and self._ptr:
+            ops.dispose(self._ptr)
+            self._ptr = 0
+            self.free_shared_buffer(self.meta_ptrs)
+            self.free_shared_buffer(self.buffer_ptrs)
+
+    def __del__(self):
+        self.close()
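
The hunk above is easier to follow with a usage sketch. The snippet below is not part of the diff: it assumes a multi-GPU launch where each rank owns one CUDA device and where the group handed to CustomAllreduce is a CPU-side (gloo) group, as the assertion in __init__ requires; the variable names and fallback branch are illustrative only.

    # Hypothetical wiring, not taken from sglang; the actual integration
    # presumably goes through sglang/srt/distributed/parallel_state.py.
    import torch
    import torch.distributed as dist

    from sglang.srt.distributed.device_communicators.custom_all_reduce import (
        CustomAllreduce,
    )

    dist.init_process_group(backend="gloo")  # must be a non-NCCL group
    rank = dist.get_rank()
    comm = CustomAllreduce(group=dist.group.WORLD, device=rank)

    x = torch.randn(4096, dtype=torch.float16, device=f"cuda:{rank}")
    out = comm.custom_all_reduce(x)  # None when the fast path is unavailable
    if out is None:
        dist.all_reduce(x)  # fall back to a regular collective
        out = x

During CUDA graph capture the same call would be wrapped in comm.capture(), so that register_graph_buffers() runs once capture finishes.
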
--- /dev/null
+++ b/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
@@ -0,0 +1,291 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+import ctypes
+import json
+import logging
+import os
+import pickle
+import subprocess
+import sys
+import tempfile
+from functools import lru_cache
+from itertools import product
+from typing import Dict, List, Optional, Sequence
+
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from sglang.srt.utils import cuda_device_count_stateless
+
+logger = logging.getLogger(__name__)
+
+
+def update_environment_variables(envs: Dict[str, str]):
+    for k, v in envs.items():
+        if k in os.environ and os.environ[k] != v:
+            logger.warning(
+                "Overwriting environment variable %s " "from '%s' to '%s'",
+                k,
+                os.environ[k],
+                v,
+            )
+        os.environ[k] = v
+
+
+def producer(
+    batch_src: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: Optional[str] = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for i in batch_src:
+        lib.cudaSetDevice(i)
+        pointer = lib.cudaMalloc(1024)
+        lib.cudaMemset(pointer, 1, 1024)
+        lib.cudaDeviceSynchronize()
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        producer_queue.put(handle)
+        open_success = consumer_queue.get()
+        if open_success:
+            # use two queues to simulate barrier
+            producer_queue.put(0)
+            consumer_queue.get()
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def consumer(
+    batch_tgt: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: Optional[str] = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for j in batch_tgt:
+        lib.cudaSetDevice(j)
+        handle = producer_queue.get()
+        open_success = False
+        try:
+            pointer = lib.cudaIpcOpenMemHandle(handle)  # type: ignore
+            open_success = True
+        except RuntimeError:
+            # cannot error out here, because the producer process
+            # is still waiting for the response.
+            pass
+        consumer_queue.put(open_success)
+        if open_success:
+            # modify the memory
+            lib.cudaMemset(pointer, 2, 1024)
+            lib.cudaDeviceSynchronize()
+            # use two queues to simulate barrier
+            producer_queue.get()
+            consumer_queue.put(0)
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def can_actually_p2p(
+    batch_src: Sequence[int],
+    batch_tgt: Sequence[int],
+) -> Sequence[bool]:
+    """
+    Usually, checking if P2P access is enabled can be done by
+    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
+    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
+    returns `True` even if P2P access is not actually possible.
+    See https://github.com/vllm-project/vllm/issues/2728 and
+    https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
+    Therefore, we have to perform a real P2P access to check if it is actually
+    possible.
+
+    Note on p2p and cuda IPC:
+    Usually, one process uses one GPU:
+    GPU src --> cuda context src --> tensor src --> process src
+
+    We need to combine p2p and cuda IPC, so that:
+    GPU src --> cuda context src --> tensor src --> process src
+                                     |shared|
+    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
+    That is to say, process src creates a tensor in GPU src, passes IPC handle to
+    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
+    tensor in process tgt will be reflected in the tensor in process src, because
+    they are the same memory segment.
+    It is important to note that process tgt accesses the tensor in GPU tgt, not
+    GPU src. That's why we need p2p access.
+
+    The most time-consuming part is the process creation. To avoid creating
+    processes for every pair of GPUs, we use batched testing. We create two
+    processes for testing all pairs of GPUs in batch. The trick is to reset
+    the device after each test (which is not available in PyTorch).
+    """  # noqa
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    # pass the CUDA_VISIBLE_DEVICES to the child process
+    # to make sure they see the same set of GPUs
+
+    # make sure the processes are spawned
+    smp = mp.get_context("spawn")
+    producer_queue = smp.Queue()
+    consumer_queue = smp.Queue()
+    result_queue = smp.Queue()
+    p_src = smp.Process(
+        target=producer,
+        args=(
+            batch_src,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_tgt = smp.Process(
+        target=consumer,
+        args=(
+            batch_tgt,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_src.start()
+    p_tgt.start()
+    p_src.join()
+    p_tgt.join()
+    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
+    result: List[bool] = []
+    for src, tgt in zip(batch_src, batch_tgt):
+        a = result_queue.get()
+        b = result_queue.get()
+        if a != b:
+            logger.warning(
+                "Two processes do not agree on the P2P access"
+                " status on %d -> %d, treat as disabled.",
+                src,
+                tgt,
+            )
+            result.append(False)
+        else:
+            result.append(a)
+    return result
+
+
+# why do we need this cache?
+# we are testing peer-to-peer (p2p) access between GPUs,across processes.
+# if we test it every time, it will be very slow, because we need to create
+# N * N * 2 processes, where N is the world size. This is very slow.
+# to reduce the time, we use a cache file to store the p2p access status.
+# the cache file is generated by the master process if it does not exist.
+# then all the processes can read the cache file to check the p2p access status.
+# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
+# can have different cache files for different CUDA_VISIBLE_DEVICES settings,
+# e.g. used by different vllm engines. The device id in the cache file is a
+# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
+# of visible devices in the vllm engine.
+_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
+
+
+def gpu_p2p_access_check(src: int, tgt: int) -> bool:
+    """Check if GPU src can access GPU tgt."""
+
+    # if the cache variable is already calculated,
+    # read from the cache instead of checking it again
+    global _gpu_p2p_access_cache
+    if _gpu_p2p_access_cache is not None:
+        return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+    is_distributed = dist.is_initialized()
+
+    num_dev = cuda_device_count_stateless()
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    if cuda_visible_devices is None:
+        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+
+    # VLLM_CACHE_ROOT -> SGLANG_CACHE_ROOT
+    # "~/.cache/vllm" -> "~/.cache/sglang"
+    SGLANG_CACHE_ROOT = os.path.expanduser("~/.cache/sglang")
+    path = os.path.join(
+        SGLANG_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    from sglang.srt.distributed.parallel_state import get_world_group
+
+    if (not is_distributed or get_world_group().local_rank == 0) and (
+        not os.path.exists(path)
+    ):
+        # only the local master process (with local_rank == 0) can
+        # enter this block to calculate the cache
+        logger.info("generating GPU P2P access cache in %s", path)
+        cache: Dict[str, bool] = {}
+        ids = list(range(num_dev))
+        # batch of all pairs of GPUs
+        batch_src, batch_tgt = zip(*list(product(ids, ids)))
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in that case we cannot use spawn method in multiprocessing.
+        # However, `can_actually_p2p` requires spawn method.
+        # The fix is, we use `subprocess` to call the function,
+        # where we have `if __name__ == "__main__":` in this file.
+
+        # use a temporary file to store the result
+        # we don't use the output of the subprocess directly,
+        # because the subprocess might produce logging output
+        with tempfile.NamedTemporaryFile() as output_file:
+            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
+            returned = subprocess.run(
+                [sys.executable, __file__], input=input_bytes, capture_output=True
+            )
+            # check if the subprocess is successful
+            try:
+                returned.check_returncode()
+            except Exception as e:
+                # wrap raised exception to provide more information
+                raise RuntimeError(
+                    f"Error happened when batch testing "
+                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
+                    f"{returned.stderr.decode()}"
+                ) from e
+            with open(output_file.name, "rb") as f:
+                result = pickle.load(f)
+        for _i, _j, r in zip(batch_src, batch_tgt, result):
+            cache[f"{_i}->{_j}"] = r
+        with open(path, "w") as f:
+            json.dump(cache, f, indent=4)
+    if is_distributed:
+        get_world_group().barrier()
+    logger.info("reading GPU P2P access cache from %s", path)
+    with open(path) as f:
+        cache = json.load(f)
+    _gpu_p2p_access_cache = cache
+    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+
+__all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    with open(output_file, "wb") as f:
+        f.write(pickle.dumps(result))
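
The practical output of custom_all_reduce_utils.py is a small JSON cache keyed by "src->tgt" pairs under ~/.cache/sglang/. As a rough illustration (assuming two visible GPUs; the actual values depend entirely on the hardware and driver), this is the shape of the data that gpu_p2p_access_check() writes on first use and reads back afterwards:

    # Illustration only, not from the package: mimic the cache that the local
    # master process dumps to
    # ~/.cache/sglang/gpu_p2p_access_cache_for_0,1.json via json.dump(cache, f, indent=4).
    import json

    example_cache = {"0->0": True, "0->1": True, "1->0": True, "1->1": True}
    print(json.dumps(example_cache, indent=4))
    print(example_cache["0->1"])  # what gpu_p2p_access_check(0, 1) returns once cached

On later calls the module-level _gpu_p2p_access_cache dictionary is consulted directly, so the probe subprocess runs at most once per CUDA_VISIBLE_DEVICES setting.
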
--- /dev/null
+++ b/sglang/srt/distributed/device_communicators/hpu_communicator.py
@@ -0,0 +1,48 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/distributed/device_communicators/hpu_communicator.py
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.utils import is_hpu
+
+if is_hpu():
+    import habana_frameworks.torch as htorch  # noqa: F401
+
+
+class HpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not is_hpu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
+        # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
+        # (which is required for tensor parallel HPUGraph inference)
+        htorch.core.mark_step()
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += x.dim()
+        input_size = x.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (world_size,) + input_size, dtype=x.dtype, device=x.device
+        )
+        # All-gather.
+        htorch.core.mark_step()
+        dist.all_gather_into_tensor(output_tensor, x, group=self.group)
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
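
To make the gather-and-reshape at the end of HpuCommunicator.all_gather concrete, here is a shape-only sketch in plain PyTorch (world size and shard shape chosen arbitrarily); it mirrors the movedim/reshape arithmetic without needing an HPU:

    # Emulate gathering an (8, 16) shard from 4 ranks along dim=-1 and folding
    # the world dimension into that axis, as the reshape in all_gather does.
    import torch

    world_size, dim = 4, 1  # dim=-1 resolved to a positive index
    x = torch.zeros(8, 16)  # per-rank shard
    gathered = torch.zeros((world_size,) + x.size())  # all_gather_into_tensor output
    out = gathered.movedim(0, dim).reshape(
        x.size()[:dim] + (world_size * x.size(dim),) + x.size()[dim + 1 :]
    )
    print(out.shape)  # torch.Size([8, 64])
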