sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +41 -27
- sglang/bench_one_batch.py +60 -4
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +83 -71
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +46 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +452 -0
- sglang/srt/entrypoints/http_server.py +603 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +8 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +71 -0
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +65 -14
- sglang/srt/layers/logits_processor.py +49 -64
- sglang/srt/layers/moe/ep_moe/layer.py +24 -16
- sglang/srt/layers/moe/fused_moe_native.py +84 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
- sglang/srt/layers/parameter.py +18 -8
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +10 -4
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1184 -31
- sglang/srt/layers/sampler.py +64 -6
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +24 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +57 -3
- sglang/srt/managers/schedule_batch.py +78 -45
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +326 -201
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +210 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +26 -30
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +44 -19
- sglang/srt/model_loader/loader.py +83 -6
- sglang/srt/model_loader/weight_utils.py +145 -6
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +17 -5
- sglang/srt/models/dbrx.py +13 -5
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +11 -11
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +15 -25
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +4 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +9 -9
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +20 -7
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +7 -4
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +143 -18
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +77 -15
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +164 -129
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +2 -1
- sglang/test/test_utils.py +83 -22
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/distributed/device_communicators/pynccl_wrapper.py
CHANGED

@@ -1,4 +1,4 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/pynccl.py

 # This file is a pure Python wrapper for the NCCL library.
 # The main purpose is to use NCCL combined with CUDA graph.
@@ -57,7 +57,7 @@ def find_nccl_library() -> str:
             so_file = "librccl.so.1"
         else:
             raise ValueError("NCCL only supports CUDA and ROCm backends.")
-        logger.
+        logger.debug("Found nccl from library %s", so_file)
     return so_file


@@ -187,6 +187,43 @@ class NCCLLibrary:
                 cudaStream_t,
             ],
         ),
+        # ncclResult_t ncclAllGather(
+        # const void* sendbuff, void* recvbuff, size_t count,
+        # ncclDataType_t datatype, ncclComm_t comm,
+        # cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclAllGather",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t ncclReduceScatter(
+        # const void* sendbuff, void* recvbuff, size_t count,
+        # ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+        # cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclReduceScatter",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # ncclResult_t ncclSend(
         # const void* sendbuff, size_t count, ncclDataType_t datatype,
         # int dest, ncclComm_t comm, cudaStream_t stream);
@@ -217,6 +254,23 @@ class NCCLLibrary:
                 cudaStream_t,
             ],
         ),
+        # ncclResult_t ncclBroadcast(
+        # const void* sendbuff, void* recvbuff, size_t count,
+        # ncclDataType_t datatype, int root, ncclComm_t comm,
+        # cudaStream_t stream);
+        Function(
+            "ncclBroadcast",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
         # be cautious! this is a collective call, it will block until all
         # processes in the communicator have called this function.
         # because Python object destruction can happen in random order,
@@ -321,6 +375,46 @@ class NCCLLibrary:
            )
        )

+    def ncclReduceScatter(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclReduceScatter"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclAllGather(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # which is an aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclAllGather"](
+                sendbuff, recvbuff, count, datatype, comm, stream
+            )
+        )
+
     def ncclSend(
         self,
         sendbuff: buffer_type,
@@ -347,6 +441,22 @@ class NCCLLibrary:
             self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
         )

+    def ncclBroadcast(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclBroadcast"](
+                sendbuff, recvbuff, count, datatype, root, comm, stream
+            )
+        )
+
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))

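The pynccl_wrapper change above is mostly mechanical: each new collective (ncclAllGather, ncclReduceScatter, ncclBroadcast) gets a declarative entry in the Function table and a typed Python method that dispatches through self._funcs. The sketch below is not sglang code; it reproduces the same ctypes pattern against libm so it can run without NCCL or a GPU. MiniMathLibrary, Function and their members here are illustrative stand-ins, not names from the package.

# Minimal sketch of the Function-table ctypes pattern, using libm instead of NCCL.
# Assumes a standard libm is discoverable (typical Linux/macOS).
import ctypes
import ctypes.util
from dataclasses import dataclass
from typing import Any, List


@dataclass
class Function:
    name: str            # symbol name in the shared library
    restype: Any         # ctypes return type
    argtypes: List[Any]  # ctypes argument types


class MiniMathLibrary:
    # one declarative entry per C symbol, mirroring NCCLLibrary's exported_functions
    exported_functions = [
        Function("cos", ctypes.c_double, [ctypes.c_double]),
        Function("hypot", ctypes.c_double, [ctypes.c_double, ctypes.c_double]),
    ]

    def __init__(self) -> None:
        self.lib = ctypes.CDLL(ctypes.util.find_library("m"))
        self._funcs = {}
        for func in self.exported_functions:
            cfunc = getattr(self.lib, func.name)
            cfunc.restype = func.restype
            cfunc.argtypes = func.argtypes
            self._funcs[func.name] = cfunc

    # typed wrappers look up the prepared ctypes function, like ncclAllGather above
    def cos(self, x: float) -> float:
        return self._funcs["cos"](x)

    def hypot(self, x: float, y: float) -> float:
        return self._funcs["hypot"](x, y)


if __name__ == "__main__":
    m = MiniMathLibrary()
    print(m.cos(0.0), m.hypot(3.0, 4.0))  # 1.0 5.0

In the real wrapper the argument types are NCCL handles and device pointers (ncclComm_t, cudaStream_t, buffer_type), and every call is checked through NCCL_CHECK, but the registration-then-dispatch structure is the same.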
sglang/srt/distributed/device_communicators/shm_broadcast.py
CHANGED

@@ -1,11 +1,9 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
-
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/shm_broadcast.py
+
 import logging
 import os
 import pickle
-import socket
 import time
-import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from multiprocessing import shared_memory
@@ -18,6 +16,8 @@ from torch.distributed import ProcessGroup
 from zmq import IPV6  # type: ignore
 from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore

+from sglang.srt.utils import get_ip, get_open_port, is_valid_ipv6_address
+
 # SGLANG_RINGBUFFER_WARNING_INTERVAL can be set to 60
 SGLANG_RINGBUFFER_WARNING_INTERVAL = int(
     os.environ.get("SGLANG_RINGBUFFER_WARNING_INTERVAL", "60")
@@ -26,73 +26,6 @@ SGLANG_RINGBUFFER_WARNING_INTERVAL = int(
 logger = logging.getLogger(__name__)


-def get_ip() -> str:
-    # SGLANG_HOST_IP env can be ignore
-    host_ip = os.getenv("SGLANG_HOST_IP", "") or os.getenv("HOST_IP", "")
-    if host_ip:
-        return host_ip
-
-    # IP is not set, try to get it from the network interface
-
-    # try ipv4
-    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-    try:
-        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
-        return s.getsockname()[0]
-    except Exception:
-        pass
-
-    # try ipv6
-    try:
-        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
-        # Google's public DNS server, see
-        # https://developers.google.com/speed/public-dns/docs/using#addresses
-        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
-        return s.getsockname()[0]
-    except Exception:
-        pass
-
-    warnings.warn(
-        "Failed to get the IP address, using 0.0.0.0 by default."
-        "The value can be set by the environment variable"
-        " SGLANG_HOST_IP or HOST_IP.",
-        stacklevel=2,
-    )
-    return "0.0.0.0"
-
-
-def get_open_port() -> int:
-
-    port = os.getenv("SGLANG_PORT")
-    if port is not None:
-        while True:
-            try:
-                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                    s.bind(("", port))
-                    return port
-            except OSError:
-                port += 1  # Increment port number if already in use
-                logger.info("Port %d is already in use, trying port %d", port - 1, port)
-    # try ipv4
-    try:
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-            s.bind(("", 0))
-            return s.getsockname()[1]
-    except OSError:
-        # try ipv6
-        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
-            s.bind(("", 0))
-            return s.getsockname()[1]
-
-
-def is_valid_ipv6_address(address: str) -> bool:
-    try:
-        ipaddress.IPv6Address(address)
-        return True
-    except ValueError:
-        return False
-
-
 class ShmRingBuffer:

     def __init__(
@@ -313,7 +246,7 @@ class MessageQueue:
             remote_subscribe_port=remote_subscribe_port,
         )

-        logger.
+        logger.debug("Message queue communication handle: %s", self.handle)

     def export_handle(self) -> Handle:
         return self.handle
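In 0.4.2 these networking helpers are consolidated in sglang.srt.utils, and shm_broadcast now imports them from there (the added import in the first hunk above). A minimal usage sketch, assuming sglang 0.4.2 is installed; the environment-variable behavior described in the comments follows the removed implementations shown in the diff.

# Usage sketch of the consolidated helpers (not part of the diff itself)
from sglang.srt.utils import get_ip, get_open_port, is_valid_ipv6_address

host = get_ip()          # prefers SGLANG_HOST_IP / HOST_IP, else probes a network interface
port = get_open_port()   # prefers SGLANG_PORT, else binds port 0 to find a free port
print(f"endpoint candidate: {host}:{port}")
print("ipv6?", is_valid_ipv6_address(host))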
sglang/srt/distributed/device_communicators/xpu_communicator.py
CHANGED

@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/xpu_communicator.py
+
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
sglang/srt/distributed/parallel_state.py
CHANGED

@@ -1,4 +1,4 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/parallel_state.py

 # Copyright 2023 The vLLM team.
 # Adapted from
sglang/srt/distributed/utils.py
CHANGED
@@ -1,4 +1,5 @@
-# Adapted from https://github.com/vllm-project/vllm/blob/
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/utils.py
+
 # Copyright 2023 The vLLM team.
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py