sglang-0.4.2.post4-py3-none-any.whl → sglang-0.4.3.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/global_config.py +2 -0
- sglang/lang/backend/openai.py +5 -0
- sglang/lang/chat_template.py +22 -7
- sglang/lang/ir.py +1 -0
- sglang/srt/configs/__init__.py +6 -3
- sglang/srt/configs/model_config.py +2 -0
- sglang/srt/configs/qwen2_5_vl_config.py +1003 -0
- sglang/srt/entrypoints/engine.py +18 -3
- sglang/srt/hf_transformers_utils.py +2 -3
- sglang/srt/layers/attention/flashinfer_backend.py +235 -110
- sglang/srt/layers/attention/triton_backend.py +358 -72
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/linear.py +12 -5
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +178 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +175 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +2 -0
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +51 -5
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +29 -29
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +33 -33
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +27 -27
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +24 -24
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +42 -42
- sglang/srt/layers/quantization/fp8_kernel.py +123 -17
- sglang/srt/layers/quantization/fp8_utils.py +33 -4
- sglang/srt/managers/detokenizer_manager.py +1 -0
- sglang/srt/managers/image_processor.py +217 -122
- sglang/srt/managers/io_struct.py +4 -0
- sglang/srt/managers/schedule_batch.py +16 -3
- sglang/srt/managers/scheduler.py +29 -0
- sglang/srt/managers/tokenizer_manager.py +6 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -0
- sglang/srt/model_executor/cuda_graph_runner.py +12 -1
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +12 -2
- sglang/srt/models/deepseek_nextn.py +295 -0
- sglang/srt/models/deepseek_v2.py +21 -8
- sglang/srt/models/llava.py +2 -1
- sglang/srt/models/qwen2_5_vl.py +722 -0
- sglang/srt/models/qwen2_vl.py +2 -1
- sglang/srt/openai_api/adapter.py +17 -3
- sglang/srt/server_args.py +26 -4
- sglang/srt/speculative/eagle_worker.py +35 -10
- sglang/srt/speculative/spec_info.py +11 -1
- sglang/srt/utils.py +7 -0
- sglang/utils.py +99 -19
- sglang/version.py +1 -1
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/METADATA +5 -4
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/RECORD +73 -55
- sglang/srt/configs/qwen2vl.py +0 -130
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_vl.py
CHANGED
@@ -31,8 +31,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
+from transformers import Qwen2VLConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig

-from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention
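The model now imports the Qwen2-VL config classes from transformers instead of sglang's vendored copy, which is deleted in this release (see the sglang/srt/configs/qwen2vl.py +0 -130 entry above). A minimal sketch of the new resolution path, assuming a transformers release that ships Qwen2-VL:

    from transformers import Qwen2VLConfig

    cfg = Qwen2VLConfig()  # resolves from transformers, not sglang's removed copy
    print(type(cfg).__module__)  # transformers.models.qwen2_vl.configuration_qwen2_vl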
sglang/srt/openai_api/adapter.py
CHANGED
@@ -20,12 +20,14 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import Dict, List
+from typing import Dict, List

 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
 from pydantic import ValidationError

+from sglang.lang.chat_template import get_chat_template_by_model_path
+
 try:
     from outlines.fsm.json_schema import convert_json_schema_to_str
 except ImportError:

@@ -92,7 +94,6 @@ file_id_response: Dict[str, FileResponse] = {}
 # map file id to file path in SGLang backend
 file_id_storage: Dict[str, str] = {}

-
 # backend storage directory
 storage_dir = None

@@ -116,12 +117,13 @@ def create_streaming_error_response(
     return json_str


-def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
+def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, model_path):
     global chat_template_name

     logger.info(
         f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
     )
+
     if not chat_template_exists(chat_template_arg):
         if not os.path.exists(chat_template_arg):
             raise RuntimeError(

@@ -163,6 +165,18 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
     else:
         chat_template_name = chat_template_arg

+    # check chat-template
+    chat_template = get_chat_template_by_model_path(model_path)
+    if chat_template is not None:
+        official_chat_template = chat_template.name
+        used_chat_template = chat_template_name
+        if official_chat_template != used_chat_template:
+            logger.warning(
+                f"Using a chat_template: '{used_chat_template}', "
+                f"which is different from official chat template: '{official_chat_template}', "
+                f"This discrepancy may lead to performance degradation."
+            )
+

 async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
     try:
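The server now compares the user-supplied chat template against the one registered for the model path and warns on mismatch. A small, hedged illustration of the lookup it relies on (the call signature comes from the diff; the model path below is a made-up example):

    from sglang.lang.chat_template import get_chat_template_by_model_path

    tmpl = get_chat_template_by_model_path("meta-llama/Llama-3.1-8B-Instruct")
    if tmpl is not None:
        print(tmpl.name)  # the "official" template name the new warning compares against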
sglang/srt/server_args.py
CHANGED
@@ -140,6 +140,7 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False

@@ -160,12 +161,15 @@ class ServerArgs:
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
+    return_hidden_states: bool = False

     # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False

+    enable_flashinfer_mla: bool = False
+
     def __post_init__(self):
         # Set missing default values
         if self.tokenizer_path is None:

@@ -258,14 +262,17 @@ class ServerArgs:
         )

         # Speculative Decoding
-        if self.speculative_algorithm == "EAGLE":
+        if (
+            self.speculative_algorithm == "EAGLE"
+            or self.speculative_algorithm == "NEXTN"
+        ):
             self.prefill_only_one_req = True
             self.disable_cuda_graph_padding = True
             self.disable_radix_cache = True
             self.disable_overlap_schedule = True
             self.chunked_prefill_size = -1
             logger.info(
-                "The radix cache, chunked prefill, and overlap scheduler are disabled because of using eagle speculative decoding."
+                f"The radix cache, chunked prefill, and overlap scheduler are disabled because of using {self.speculative_algorithm} speculative decoding."
             )

         # GGUF

@@ -691,12 +698,17 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mla",
+            action="store_true",
+            help="Enable FlashInfer MLA optimization",
+        )

         # Speculative decoding
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE"],
+            choices=["EAGLE", "NEXTN"],
             help="Speculative algorithm.",
         )
         parser.add_argument(

@@ -782,6 +794,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-nccl-nvls",
+            action="store_true",
+            help="Enable NCCL NVLS for prefill heavy requests when available.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",

@@ -795,7 +812,7 @@ class ServerArgs:
         parser.add_argument(
             "--disable-mla",
             action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
         )
         parser.add_argument(
             "--disable-overlap-schedule",

@@ -896,6 +913,11 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
+        parser.add_argument(
+            "--return-hidden-states",
+            action="store_true",
+            help="Return hidden states in the response.",
+        )
         # Function Calling
         parser.add_argument(
             "--tool-call-parser",
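Taken together, this release adds four user-facing options: the NEXTN speculative algorithm, --enable-flashinfer-mla, --enable-nccl-nvls, and --return-hidden-states. A hedged sketch of setting them programmatically (field names are from the diff above; the model path and values are illustrative, not a recommended configuration):

    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(
        model_path="deepseek-ai/DeepSeek-V3",  # illustrative model
        speculative_algorithm="NEXTN",         # new choice alongside EAGLE
        enable_flashinfer_mla=True,            # new flag
        enable_nccl_nvls=False,                # new flag
        return_hidden_states=False,            # new flag
    )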
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -24,6 +24,7 @@ from sglang.srt.speculative.eagle_utils import (
     fast_topk,
     select_top_k_tokens,
 )
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

 logger = logging.getLogger(__name__)

@@ -57,23 +58,43 @@ class EAGLEWorker(TpModelWorker):
         # Parse arguments
         self.topk = server_args.speculative_eagle_topk
         self.speculative_num_steps = server_args.speculative_num_steps
+        self.speculative_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
         self.server_args = server_args

         # Share the embedding and lm_head
-        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
-        self.model_runner.model.set_embed_and_head(embed, head)
+        if not self.speculative_algorithm.is_nextn():
+            embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+            self.model_runner.model.set_embed_and_head(embed, head)
         self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph

         # Create multi-step attn backends and cuda graph runners
-        from sglang.srt.layers.attention.flashinfer_backend import (
-            FlashInferMultiStepDraftBackend,
-        )
-        self.draft_attn_backend = FlashInferMultiStepDraftBackend(
-            self.model_runner,
-            self.topk,
-            self.speculative_num_steps,
-        )
+        if server_args.attention_backend == "flashinfer":
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+                self.model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+        elif server_args.attention_backend == "triton":
+            from sglang.srt.layers.attention.triton_backend import (
+                TritonMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = TritonMultiStepDraftBackend(
+                self.model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+        else:
+            raise ValueError(
+                f"EAGLE is not supportted in attention backend {server_args.attention_backend}"
+            )

         self.model_runner.draft_attn_backend = self.draft_attn_backend
         self.init_cuda_graphs()

@@ -218,6 +239,10 @@ class EAGLEWorker(TpModelWorker):
             token_list.append(tree_info[1])
             parents_list.append(tree_info[2])

+            # we don't need to run the last forward. we get 1 token from draft prefill and (#spec steps - 1) tokens here
+            if i == self.speculative_num_steps - 1:
+                break
+
             # Set inputs
             forward_batch.input_ids = input_ids
             forward_batch.out_cache_loc = out_cache_loc[
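The draft model's multi-step attention backend now follows --attention-backend, so EAGLE-style drafting is no longer tied to FlashInfer. A hedged configuration sketch (field names from the diff; model path illustrative, and a real EAGLE run would also need a draft model path):

    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(
        model_path="meta-llama/Llama-2-7b-chat-hf",  # illustrative target model
        speculative_algorithm="EAGLE",
        attention_backend="triton",  # now dispatches to TritonMultiStepDraftBackend
    )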
sglang/srt/speculative/spec_info.py
CHANGED
@@ -5,18 +5,28 @@ class SpeculativeAlgorithm(IntEnum):
     NONE = auto()
     EAGLE = auto()

+    # NEXTN spec decoding is for DeepSeek V3/R1
+    # currently it's implemented based on EAGLE
+    NEXTN = auto()
+
     def is_none(self):
         return self == SpeculativeAlgorithm.NONE

     def is_eagle(self):
-        return self == SpeculativeAlgorithm.EAGLE
+        return self == SpeculativeAlgorithm.EAGLE or self == SpeculativeAlgorithm.NEXTN
+
+    def is_nextn(self):
+        return self == SpeculativeAlgorithm.NEXTN

     @staticmethod
     def from_string(name: str):
         name_map = {
             "EAGLE": SpeculativeAlgorithm.EAGLE,
+            "NEXTN": SpeculativeAlgorithm.NEXTN,
             None: SpeculativeAlgorithm.NONE,
         }
+        if name is not None:
+            name = name.upper()
         return name_map[name]
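A quick sketch of the resulting behavior (the calls match the diff above):

    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

    alg = SpeculativeAlgorithm.from_string("nextn")  # lookup is now case-insensitive
    assert alg.is_nextn()
    assert alg.is_eagle()  # NEXTN is treated as EAGLE-based throughout the worker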
sglang/srt/utils.py
CHANGED
@@ -1444,3 +1444,10 @@ def launch_dummy_health_check_server(host, port):
         timeout_keep_alive=5,
         loop="uvloop",
     )
+
+
+def set_cuda_arch():
+    if is_flashinfer_available():
+        capability = torch.cuda.get_device_capability()
+        arch = f"{capability[0]}.{capability[1]}"
+        os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
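The new helper pins TORCH_CUDA_ARCH_LIST to the local GPU's compute capability, appending +PTX only on 9.0. A hedged usage sketch (behavior read off the diff; the example outputs assume specific GPUs):

    import os
    from sglang.srt.utils import set_cuda_arch

    set_cuda_arch()  # no-op unless FlashInfer is available
    print(os.environ.get("TORCH_CUDA_ARCH_LIST"))  # e.g. "9.0+PTX" on H100, "8.0" on A100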
sglang/utils.py
CHANGED
@@ -306,22 +306,112 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
     return filename


+import fcntl
+
+
+def is_in_ci():
+    from sglang.test.test_utils import is_in_ci
+
+    return is_in_ci()
+
+
+LOCKFILE = os.path.expanduser("~/.sglang_port_lock")
+PORT_REGISTRY = os.path.expanduser("~/.sglang_port_registry.json")
+
+if not os.path.exists(LOCKFILE):
+    with open(LOCKFILE, "w") as f:
+        pass
+
+if not os.path.exists(PORT_REGISTRY):
+    with open(PORT_REGISTRY, "w") as f:
+        json.dump([], f)
+
+
+def print_highlight(html_content: str):
+    if is_in_ci():
+        html_content = str(html_content).replace("\n", "<br>")
+        display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
+    else:
+        print(html_content)
+
+
+def init_port_registry():
+    """Initialize the port registry file if it doesn't exist."""
+    if not os.path.exists(PORT_REGISTRY):
+        with open(PORT_REGISTRY, "w") as f:
+            json.dump([], f)
+
+
+def reserve_port(start=30000, end=40000):
+    """
+    Reserve an available port using a file lock and a registry.
+    Returns the allocated port.
+    """
+    init_port_registry()
+    with open(LOCKFILE, "w") as lock:
+        fcntl.flock(lock, fcntl.LOCK_EX)
+        try:
+            with open(PORT_REGISTRY, "r") as f:
+                used = json.load(f)
+        except Exception:
+            used = []
+        for port in range(start, end):
+            if port not in used:
+                used.append(port)
+                with open(PORT_REGISTRY, "w") as f:
+                    json.dump(used, f)
+                return port
+        raise RuntimeError("No free port available")
+
+
+def release_port(port):
+    """Release the reserved port by removing it from the registry."""
+    with open(LOCKFILE, "w") as lock:
+        fcntl.flock(lock, fcntl.LOCK_EX)
+        try:
+            with open(PORT_REGISTRY, "r") as f:
+                used = json.load(f)
+        except Exception:
+            used = []
+        if port in used:
+            used.remove(port)
+            with open(PORT_REGISTRY, "w") as f:
+                json.dump(used, f)
+
+
 def execute_shell_command(command: str) -> subprocess.Popen:
     """
-
-    Returns:
-        subprocess.Popen: Process handle
+    Execute a shell command and return its process handle.
     """
+    # Replace newline continuations and split the command string.
     command = command.replace("\\\n", " ").replace("\\", " ")
     parts = command.split()
-
     return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)


+def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
+    """
+    Launch the server using the given command.
+    If no port is specified, a free port is reserved.
+    """
+    if port is None:
+        port = reserve_port()
+    full_command = f"{command} --port {port}"
+    process = execute_shell_command(full_command)
+    return process, port
+
+
+def terminate_process(process, port=None):
+    """
+    Terminate the process and, if a port was reserved, release it.
+    """
+    from sglang.srt.utils import kill_process_tree
+
+    kill_process_tree(process.pid)
+    if port is not None:
+        release_port(port)
+
+
 def wait_for_server(base_url: str, timeout: int = None) -> None:
     """Wait for the server to be ready by polling the /v1/models endpoint.

@@ -343,6 +433,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
                 NOTE: Typically, the server runs in a separate terminal.
                 In this notebook, we run the server and notebook code together, so their outputs are combined.
                 To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+                We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
                 """
             )
             break

@@ -353,17 +444,6 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
         time.sleep(1)


-def terminate_process(process):
-    from sglang.srt.utils import kill_process_tree
-
-    kill_process_tree(process.pid)
-
-
-def print_highlight(html_content: str):
-    html_content = str(html_content).replace("\n", "<br>")
-    display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
-
-
 class TypeBasedDispatcher:
     def __init__(self, mapping: List[Tuple[Type, Callable]]):
         self._mapping = mapping
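These helpers let the documentation notebooks run one server per notebook without port collisions: reserve_port picks a free port under an fcntl file lock, and terminate_process now releases it. A hedged end-to-end sketch (the model path is illustrative):

    from sglang.utils import launch_server_cmd, terminate_process, wait_for_server

    server_process, port = launch_server_cmd(
        "python -m sglang.launch_server --model-path Qwen/Qwen2.5-0.5B-Instruct"  # illustrative model
    )
    wait_for_server(f"http://localhost:{port}")
    # ... issue requests against the OpenAI-compatible endpoint ...
    terminate_process(server_process, port)  # also releases the reserved port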
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.2.post4"
+__version__ = "0.4.3.post1"
{sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.2.post4
+Version: 0.4.3.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004

@@ -235,14 +235,15 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar
+Requires-Dist: xgrammar==0.1.10; extra == "runtime-common"
+Requires-Dist: ninja; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.3.
+Requires-Dist: sgl-kernel>=0.0.3.post6; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
-Requires-Dist: flashinfer_python>=0.2.
+Requires-Dist: flashinfer_python>=0.2.1.post1; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"