sglang 0.4.1.post6__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +24 -16
- sglang/bench_one_batch.py +51 -3
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +37 -28
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +15 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +449 -0
- sglang/srt/entrypoints/http_server.py +579 -0
- sglang/srt/layers/activation.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +69 -0
- sglang/srt/layers/linear.py +41 -5
- sglang/srt/layers/logits_processor.py +48 -63
- sglang/srt/layers/moe/ep_moe/layer.py +4 -4
- sglang/srt/layers/moe/fused_moe_native.py +69 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
- sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
- sglang/srt/layers/parameter.py +2 -1
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/fp8.py +6 -3
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1179 -31
- sglang/srt/layers/sampler.py +39 -1
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +23 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +25 -2
- sglang/srt/managers/schedule_batch.py +49 -22
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +277 -178
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +206 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +4 -6
- sglang/srt/model_executor/model_runner.py +37 -15
- sglang/srt/model_loader/loader.py +8 -6
- sglang/srt/model_loader/weight_utils.py +55 -2
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +3 -3
- sglang/srt/models/dbrx.py +4 -4
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +8 -8
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +6 -24
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +6 -6
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +3 -3
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/sampling_batch_info.py +139 -4
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +57 -14
- sglang/srt/utils.py +103 -65
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +16 -5
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +119 -115
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -29,8 +29,8 @@ from sglang.srt.utils import (
     get_nvgpu_memory_capacity,
     is_flashinfer_available,
     is_hip,
-    is_ipv6,
     is_port_available,
+    is_valid_ipv6_address,
     nullable_str,
 )

@@ -157,6 +157,10 @@ class ServerArgs:
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
+    allow_auto_truncate: bool = False
+
+    # Custom logit processor
+    enable_custom_logit_processor: bool = False

     def __post_init__(self):
         # Set missing default values

@@ -240,14 +244,13 @@ class ServerArgs:
         # Others
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
+            assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // 2
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            self.disable_overlap_schedule = True
             logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
-                "Overlap scheduler is disabled."
             )

         # Speculative Decoding

@@ -392,7 +395,7 @@ class ServerArgs:
             "--device",
             type=str,
             default="cuda",
-            choices=["cuda", "xpu", "hpu"],
+            choices=["cuda", "xpu", "hpu", "cpu"],
             help="The device type.",
         )
         parser.add_argument(

@@ -860,6 +863,16 @@ class ServerArgs:
             action="store_true",
             help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
         )
+        parser.add_argument(
+            "--allow-auto-truncate",
+            action="store_true",
+            help="Allow automatically truncating requests that exceed the maximum input length instead of returning an error.",
+        )
+        parser.add_argument(
+            "--enable-custom-logit-processor",
+            action="store_true",
+            help="Enable users to pass custom logit processors to the server (disabled by default for security)",
+        )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):

@@ -870,7 +883,7 @@ class ServerArgs:
         return cls(**{attr: getattr(args, attr) for attr in attrs})

     def url(self):
-        if is_ipv6(self.host):
+        if is_valid_ipv6_address(self.host):
             return f"http://[{self.host}]:{self.port}"
         else:
             return f"http://{self.host}:{self.port}"

@@ -880,8 +893,8 @@ class ServerArgs:
             self.tp_size % self.nnodes == 0
         ), "tp_size must be divisible by number of nodes"
         assert not (
-            self.dp_size > 1 and self.nnodes != 1
-        ), "multi-node data parallel is not supported"
+            self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
+        ), "multi-node data parallel is not supported unless dp attention!"
         assert (
             self.max_loras_per_batch > 0
             # FIXME
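Note on the two new flags above: --allow-auto-truncate trades an error for silent truncation of over-length prompts, and --enable-custom-logit-processor gates the new sglang/srt/sampling/custom_logit_processor.py module (see the file list), which the help text says is disabled by default for security. The sketch below shows the general shape of such a processor; the class name and call signature are assumptions for illustration, not taken from this diff.

import torch


class BanTokensProcessor:
    """Hypothetical custom logit processor: masks a fixed set of token ids."""

    def __init__(self, banned_token_ids: list[int]):
        self.banned_token_ids = banned_token_ids

    def __call__(self, logits: torch.Tensor) -> torch.Tensor:
        # Assign -inf so the banned tokens can never be sampled.
        logits[..., self.banned_token_ids] = float("-inf")
        return logits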
@@ -919,6 +932,9 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
     return server_args


+ZMQ_TCP_PORT_DELTA = 233
+
+
 @dataclasses.dataclass
 class PortArgs:
     # The ipc filename for tokenizer to receive inputs from detokenizer (zmq)

@@ -932,7 +948,7 @@ class PortArgs:
     nccl_port: int

     @staticmethod
-    def init_new(server_args) -> "PortArgs":
+    def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         port = server_args.port + random.randint(100, 1000)
         while True:
             if is_port_available(port):

@@ -942,12 +958,39 @@ class PortArgs:
             else:
                 port -= 43

-        return PortArgs(
-            tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
-            scheduler_input_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
-            detokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
-            nccl_port=port,
-        )
+        if not server_args.enable_dp_attention:
+            # Normal case, use IPC within a single node
+            return PortArgs(
+                tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                scheduler_input_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                detokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                nccl_port=port,
+            )
+        else:
+            # DP attention. Use TCP + port to handle both single-node and multi-node.
+            if server_args.nnodes == 1 and server_args.dist_init_addr is None:
+                dist_init_addr = ("127.0.0.1", server_args.port + ZMQ_TCP_PORT_DELTA)
+            else:
+                dist_init_addr = server_args.dist_init_addr.split(":")
+                assert (
+                    len(dist_init_addr) == 2
+                ), "please provide --dist-init-addr as host:port of head node"
+
+            dist_init_host, dist_init_port = dist_init_addr
+            port_base = int(dist_init_port) + 1
+            if dp_rank is None:
+                scheduler_input_port = (
+                    port_base + 2
+                )  # TokenizerManager to DataParallelController
+            else:
+                scheduler_input_port = port_base + 2 + 1 + dp_rank
+
+            return PortArgs(
+                tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
+                scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
+                detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
+                nccl_port=port,
+            )


 class LoRAPathAction(argparse.Action):
sglang/srt/utils.py
CHANGED
@@ -59,6 +59,7 @@ from triton.runtime.cache import (
     default_dump_dir,
     default_override_dir,
 )
+from uvicorn.config import LOGGING_CONFIG

 logger = logging.getLogger(__name__)

@@ -101,14 +102,6 @@ def is_cuda_available():
     return torch.cuda.is_available() and torch.version.cuda


-def is_ipv6(address):
-    try:
-        ipaddress.IPv6Address(address)
-        return True
-    except ipaddress.AddressValueError:
-        return False
-
-
 def enable_show_time_cost():
     global show_time_cost
     show_time_cost = True

@@ -222,6 +215,10 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True):

         free_gpu_memory, total_gpu_memory = torch.hpu.mem_get_info()

+    elif device == "cpu":
+        # TODO: rename the variables in the current function to be not GPU specific
+        free_gpu_memory = psutil.virtual_memory().available
+
     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
             torch.device(device, gpu_id)

@@ -446,6 +443,8 @@ def load_image(image_file: Union[str, bytes]):
     else:
         raise ValueError(f"Invalid image: {image}")

+    # if image_size is None:
+    #     image_size = image.size
     return image, image_size

@@ -511,76 +510,32 @@ def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
             pass


-def monkey_patch_vllm_p2p_access_check(gpu_id: int):
+def monkey_patch_p2p_access_check():
     """
-    Monkey patch the slow p2p access check
+    Monkey patch the slow p2p access check.
     NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
     """

-    import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
+    import sglang.srt.distributed.device_communicators.custom_all_reduce_utils as tgt

     setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)

     # Suppress the warnings from this delete function when using sglang.bench_one_batch
-    from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+    from sglang.srt.distributed.device_communicators.custom_all_reduce import (
+        CustomAllreduce,
+    )

     setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)


-vllm_all_gather_backup = None
-
-
-def monkey_patch_vllm_all_gather(reverse: bool = False):
-    """Monkey patch all-gather to remove in-place operations."""
-    from torch.distributed import _functional_collectives as funcol
-    from vllm.distributed.parallel_state import GroupCoordinator
-
-    global vllm_all_gather_backup
-    if vllm_all_gather_backup is None:
-        vllm_all_gather_backup = GroupCoordinator.all_gather
-
-    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
-        world_size = self.world_size
-        # Bypass the function if we are using only 1 GPU.
-        if world_size == 1:
-            return input_
-        assert (
-            -input_.dim() <= dim < input_.dim()
-        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
-        if dim < 0:
-            # Convert negative dim to positive.
-            dim += input_.dim()
-        input_size = input_.size()
-        # Allocate output tensor.
-        output_tensor = torch.empty(
-            (world_size,) + input_size, dtype=input_.dtype, device=input_.device
-        )
-
-        output_tensor = funcol.all_gather_tensor(
-            input_, gather_dim=0, group=self.device_group
-        ).view((world_size,) + input_size)
-
-        # Reshape
-        output_tensor = output_tensor.movedim(0, dim)
-        output_tensor = output_tensor.reshape(
-            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
-        )
-        return output_tensor
-
-    if reverse:
-        setattr(GroupCoordinator, "all_gather", vllm_all_gather_backup)
-    else:
-        setattr(GroupCoordinator, "all_gather", all_gather)
-
-
 def monkey_patch_vllm_gguf_config():
-    from vllm.model_executor.layers.linear import LinearBase
     from vllm.model_executor.layers.quantization.gguf import (
         GGUFConfig,
         GGUFEmbeddingMethod,
         GGUFLinearMethod,
     )

+    from sglang.srt.layers.linear import LinearBase
     from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding

     def get_quant_method_with_embedding_replaced(

@@ -788,7 +743,9 @@ def first_rank_print(*args, **kwargs):
     pass


-def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint: str):
+def get_zmq_socket(
+    context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
+):
     mem = psutil.virtual_memory()
     total_mem = mem.total / 1024**3
     available_mem = mem.available / 1024**3

@@ -801,14 +758,17 @@ def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint: str):
     if socket_type == zmq.PUSH:
         socket.setsockopt(zmq.SNDHWM, 0)
         socket.setsockopt(zmq.SNDBUF, buf_size)
-        socket.connect(f"ipc://{endpoint}")
     elif socket_type == zmq.PULL:
         socket.setsockopt(zmq.RCVHWM, 0)
         socket.setsockopt(zmq.RCVBUF, buf_size)
-        socket.bind(f"ipc://{endpoint}")
     else:
         raise ValueError(f"Unsupported socket type: {socket_type}")

+    if bind:
+        socket.bind(endpoint)
+    else:
+        socket.connect(endpoint)
+
     return socket
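The refactored get_zmq_socket no longer hard-codes ipc:// or ties bind/connect to the socket type: the transport now travels inside the endpoint string and the caller decides which side binds, which is what lets PortArgs switch between IPC and TCP. A minimal usage sketch (the endpoint address here is chosen arbitrarily):

import zmq

from sglang.srt.utils import get_zmq_socket

ctx = zmq.Context()

# The endpoint carries its own transport prefix, matching the new PortArgs.
pull = get_zmq_socket(ctx, zmq.PULL, "tcp://127.0.0.1:30235", bind=True)
push = get_zmq_socket(ctx, zmq.PUSH, "tcp://127.0.0.1:30235", bind=False)

push.send_pyobj({"hello": "sglang"})
print(pull.recv_pyobj())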
@@ -1250,9 +1210,9 @@ def dataclass_to_string_truncated(data, max_length=2048):
     if isinstance(data, str):
         if len(data) > max_length:
             half_length = max_length // 2
-            return f"{data[:half_length]} ... {data[-half_length:]}"
+            return f"{repr(data[:half_length])} ... {repr(data[-half_length:])}"
         else:
-            return f"{data}"
+            return f"{repr(data)}"
     elif isinstance(data, (list, tuple)):
         if len(data) > max_length:
             half_length = max_length // 2

@@ -1263,7 +1223,7 @@ def dataclass_to_string_truncated(data, max_length=2048):
         return (
             "{"
             + ", ".join(
-                f"{k}: {dataclass_to_string_truncated(v, max_length)}"
+                f"'{k}': {dataclass_to_string_truncated(v, max_length)}"
                 for k, v in data.items()
             )
             + "}"

@@ -1404,3 +1364,81 @@ def nullable_str(val: str):
     if not val or val == "None":
         return None
     return val
+
+
+def set_uvicorn_logging_configs():
+    LOGGING_CONFIG["formatters"]["default"][
+        "fmt"
+    ] = "[%(asctime)s] %(levelprefix)s %(message)s"
+    LOGGING_CONFIG["formatters"]["default"]["datefmt"] = "%Y-%m-%d %H:%M:%S"
+    LOGGING_CONFIG["formatters"]["access"][
+        "fmt"
+    ] = '[%(asctime)s] %(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s'
+    LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M:%S"
+
+
+def get_ip() -> str:
+    # SGLANG_HOST_IP env can be ignore
+    host_ip = os.getenv("SGLANG_HOST_IP", "") or os.getenv("HOST_IP", "")
+    if host_ip:
+        return host_ip
+
+    # IP is not set, try to get it from the network interface
+
+    # try ipv4
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    try:
+        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    # try ipv6
+    try:
+        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+        # Google's public DNS server, see
+        # https://developers.google.com/speed/public-dns/docs/using#addresses
+        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    warnings.warn(
+        "Failed to get the IP address, using 0.0.0.0 by default."
+        "The value can be set by the environment variable"
+        " SGLANG_HOST_IP or HOST_IP.",
+        stacklevel=2,
+    )
+    return "0.0.0.0"
+
+
+def get_open_port() -> int:
+
+    port = os.getenv("SGLANG_PORT")
+    if port is not None:
+        while True:
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.bind(("", port))
+                    return port
+            except OSError:
+                port += 1  # Increment port number if already in use
+                logger.info("Port %d is already in use, trying port %d", port - 1, port)
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def is_valid_ipv6_address(address: str) -> bool:
+    try:
+        ipaddress.IPv6Address(address)
+        return True
+    except ValueError:
+        return False
sglang/test/runners.py
CHANGED
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================

-import json
 import multiprocessing as mp
 import os
 from dataclasses import dataclass

@@ -22,8 +21,8 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoModelForCausalLM

+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server import Runtime
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER

 DEFAULT_PROMPTS = [

@@ -278,7 +277,7 @@ class SRTRunner:
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
-        self.runtime = Runtime(
+        self.engine = Engine(
             model_path=model_path,
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),

@@ -306,7 +305,7 @@ class SRTRunner:
             top_output_logprobs = []
             sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
             for i, prompt in enumerate(prompts):
-                response = self.runtime.generate(
+                response = self.engine.generate(
                     prompt,
                     lora_path=lora_paths[i] if lora_paths else None,
                     sampling_params=sampling_params,

@@ -314,7 +313,6 @@ class SRTRunner:
                     logprob_start_len=0,
                     top_logprobs_num=NUM_TOP_LOGPROBS,
                 )
-                response = json.loads(response)
                 output_strs.append(response["text"])
                 top_input_logprobs.append(
                     [

@@ -343,8 +341,7 @@ class SRTRunner:
                 top_output_logprobs=top_output_logprobs,
             )
         else:
-            response = self.runtime.encode(prompts)
-            response = json.loads(response)
+            response = self.engine.encode(prompts)
             if self.model_type == "embedding":
                 logits = [x["embedding"] for x in response]
                 return ModelOutput(embed_logits=logits)

@@ -366,20 +363,18 @@ class SRTRunner:
             # the return value contains logprobs from prefill
             output_strs = []
             sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
-            response = self.runtime.generate(
+            response = self.engine.generate(
                 prompts,
                 lora_path=lora_paths if lora_paths else None,
                 sampling_params=sampling_params,
             )
-            response = json.loads(response)
             output_strs = [r["text"] for r in response]

             return ModelOutput(
                 output_strs=output_strs,
             )
         else:
-            response = self.runtime.encode(prompts)
-            response = json.loads(response)
+            response = self.engine.encode(prompts)
             if self.model_type == "embedding":
                 logits = [x["embedding"] for x in response]
                 return ModelOutput(embed_logits=logits)

@@ -391,8 +386,8 @@ class SRTRunner:
         return self

     def __exit__(self, exc_type, exc_value, traceback):
-        self.runtime.shutdown()
-        del self.runtime
+        self.engine.shutdown()
+        del self.engine


 def monkey_patch_gemma2_sdpa():
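The runner's move from Runtime to Engine mirrors the new entrypoint split (sglang/srt/entrypoints/engine.py and http_server.py in the file list): Engine.generate and Engine.encode return plain Python objects, so every json.loads round-trip disappears. A minimal offline sketch against the new API, assuming defaults for the constructor arguments not shown and a model available locally:

from sglang.srt.entrypoints.engine import Engine

engine = Engine(model_path="Qwen/Qwen2.5-1.5B-Instruct", tp_size=1)
try:
    response = engine.generate(
        "The capital of France is",
        sampling_params={"max_new_tokens": 16, "temperature": 0},
    )
    print(response["text"])  # already a dict, no json.loads needed
finally:
    engine.shutdown()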
sglang/test/test_programs.py
CHANGED
@@ -535,7 +535,7 @@ def test_hellaswag_select():

     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
-    assert np.abs(accuracy_gen - accuracy) < 0.
+    assert np.abs(accuracy_gen - accuracy) < 0.05
     assert np.abs(latency_gen - latency) < 1

     return accuracy, latency
sglang/test/test_utils.py
CHANGED
@@ -40,6 +40,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mis
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"


 def is_in_ci():

@@ -405,7 +406,7 @@ def popen_launch_server(
     base_url: str,
     timeout: float,
     api_key: Optional[str] = None,
-    other_args:
+    other_args: list[str] = (),
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
 ):

@@ -560,6 +561,7 @@ def run_bench_serving(
         tokenizer=tokenizer,
         num_prompts=num_prompts,
         sharegpt_output_len=None,
+        sharegpt_context_len=None,
        random_input_len=random_input_len,
        random_output_len=random_output_len,
        random_range_ratio=0.0,
sglang/utils.py
CHANGED
@@ -1,7 +1,6 @@
 """Common utilities"""

 import base64
-import gc
 import importlib
 import json
 import logging

@@ -15,7 +14,7 @@ import urllib.request
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from json import dumps
-from typing import Optional, Union
+from typing import Any, Callable, List, Optional, Tuple, Type, Union

 import numpy as np
 import requests

@@ -363,3 +362,14 @@ def terminate_process(process):
 def print_highlight(html_content: str):
     html_content = str(html_content).replace("\n", "<br>")
     display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
+
+
+class TypeBasedDispatcher:
+    def __init__(self, mapping: List[Tuple[Type, Callable]]):
+        self._mapping = mapping
+
+    def __call__(self, obj: Any):
+        for ty, fn in self._mapping:
+            if isinstance(obj, ty):
+                return fn(obj)
+        raise ValueError(f"Invalid object: {obj}")
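TypeBasedDispatcher is a small first-match, isinstance-based dispatcher; given the parallel changes in the managers, it looks intended for routing io_struct message objects to handlers, though that is an inference from the file list. Usage is straightforward (the request classes here are stand-ins):

from sglang.utils import TypeBasedDispatcher


class GenerateReq: ...


class AbortReq: ...


# The first matching isinstance wins, so order entries specific-to-general.
dispatcher = TypeBasedDispatcher(
    [
        (GenerateReq, lambda req: "handle generate"),
        (AbortReq, lambda req: "handle abort"),
    ]
)

print(dispatcher(AbortReq()))  # -> "handle abort"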
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post6"
+__version__ = "0.4.1.post7"
{sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post6
+Version: 0.4.1.post7
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                 Version 2.0, January 2004

@@ -236,13 +236,13 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar>=0.1.
+Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.
+Requires-Dist: sgl-kernel>=0.0.2.post14; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm==0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"

@@ -252,6 +252,9 @@ Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
+Provides-Extra: srt-cpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: torch; extra == "srt-cpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"

@@ -288,6 +291,11 @@ Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
 Requires-Dist: sglang[openai]; extra == "all-hpu"
 Requires-Dist: sglang[anthropic]; extra == "all-hpu"
 Requires-Dist: sglang[litellm]; extra == "all-hpu"
+Provides-Extra: all-cpu
+Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
+Requires-Dist: sglang[openai]; extra == "all-cpu"
+Requires-Dist: sglang[anthropic]; extra == "all-cpu"
+Requires-Dist: sglang[litellm]; extra == "all-cpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"

@@ -300,6 +308,9 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
 Provides-Extra: dev-hpu
 Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
 Requires-Dist: sglang[test]; extra == "dev-hpu"
+Provides-Extra: dev-cpu
+Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
+Requires-Dist: sglang[test]; extra == "dev-cpu"

 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

@@ -361,7 +372,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.

 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.