sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +3 -11
- sglang/bench_serving.py +149 -1
- sglang/check_env.py +3 -3
- sglang/lang/chat_template.py +44 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseekvl2.py +3 -0
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +32 -0
- sglang/srt/constrained/xgrammar_backend.py +11 -19
- sglang/srt/conversation.py +151 -3
- sglang/srt/disaggregation/decode.py +4 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +9 -18
- sglang/srt/disaggregation/nixl/conn.py +241 -71
- sglang/srt/disaggregation/utils.py +44 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +58 -24
- sglang/srt/entrypoints/http_server.py +28 -1
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +22 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +146 -50
- sglang/srt/layers/attention/flashinfer_backend.py +129 -94
- sglang/srt/layers/attention/flashinfer_mla_backend.py +88 -30
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
- sglang/srt/layers/moe/ep_moe/layer.py +120 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +98 -57
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -5
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +6 -1
- sglang/srt/layers/quantization/fp8.py +108 -95
- sglang/srt/layers/quantization/fp8_kernel.py +79 -60
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +81 -35
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +44 -16
- sglang/srt/managers/schedule_policy.py +11 -5
- sglang/srt/managers/scheduler.py +291 -72
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +24 -13
- sglang/srt/managers/tp_worker.py +60 -28
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +159 -90
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_nextn.py +2 -277
- sglang/srt/models/deepseek_v2.py +132 -37
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +93 -31
- sglang/srt/models/llama4.py +54 -7
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +4 -16
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +58 -62
- sglang/srt/openai_api/protocol.py +38 -16
- sglang/srt/reasoning_parser.py +2 -2
- sglang/srt/sampling/sampling_batch_info.py +54 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +93 -24
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +123 -10
- sglang/test/runners.py +4 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_utils.py +32 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +119 -99
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/test/test_deepep_utils.py
ADDED

```diff
@@ -0,0 +1,219 @@
+# Copy from deepseek-ai/DeepEP/tests/test_utils.py
+
+import os
+import sys
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+
+def init_dist(local_rank: int, num_local_ranks: int):
+    # NOTES: you may rewrite this function with your own cluster settings
+    ip = os.getenv("MASTER_ADDR", "127.0.0.1")
+    port = int(os.getenv("MASTER_PORT", "8361"))
+    num_nodes = int(os.getenv("WORLD_SIZE", 1))
+    node_rank = int(os.getenv("RANK", 0))
+    assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
+
+    dist.init_process_group(
+        backend="nccl",
+        init_method=f"tcp://{ip}:{port}",
+        world_size=num_nodes * num_local_ranks,
+        rank=node_rank * num_local_ranks + local_rank,
+    )
+    torch.set_default_dtype(torch.bfloat16)
+    torch.set_default_device("cuda")
+    torch.cuda.set_device(local_rank)
+
+    return (
+        dist.get_rank(),
+        dist.get_world_size(),
+        dist.new_group(list(range(num_local_ranks * num_nodes))),
+    )
+
+
+def calc_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double() + 1, y.double() + 1
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return (1 - sim).item()
+
+
+def per_token_cast_to_fp8(x: torch.Tensor):
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(
+        m, n
+    ), (x_amax / 448.0).view(m, -1)
+
+
+def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
+    x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
+    x_scales = x_scales.view(x_fp8.size(0), -1, 1)
+    return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
+
+
+def inplace_unique(x: torch.Tensor, num_slots: int):
+    assert x.dim() == 2
+    mask = x < 0
+    x_padded = x.masked_fill(mask, num_slots)
+    bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
+    bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
+    bin_count = bin_count[:, :num_slots]
+    sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
+    sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
+    sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
+    x[:, :].fill_(-1)
+    valid_len = min(num_slots, x.size(1))
+    x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
+
+
+def create_grouped_scores(
+    scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int
+):
+    num_tokens, num_experts = scores.shape
+    scores = scores.view(num_tokens, num_groups, -1)
+    mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
+    mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
+    return (scores * mask).view(num_tokens, num_experts)
+
+
+def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
+    # Flush L2 cache with 256 MB data
+    torch.cuda.synchronize()
+    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
+
+    # Warmup
+    for _ in range(num_warmups):
+        fn()
+
+    # Flush L2
+    cache.zero_()
+
+    # Testing
+    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    for i in range(num_tests):
+        # Record
+        start_events[i].record()
+        fn()
+        end_events[i].record()
+        if post_fn is not None:
+            post_fn()
+    torch.cuda.synchronize()
+
+    times = np.array(
+        [s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)]
+    )[1:]
+    return np.average(times), np.min(times), np.max(times)
+
+
+class empty_suppress:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_):
+        pass
+
+
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+
+        self.outnull_file.close()
+        self.errnull_file.close()
+
+
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: Optional[str] = None,
+    barrier_comm_profiling: bool = False,
+):
+    # Profile
+    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
+    with suppress():
+        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+        ) as prof:
+            for i in range(2):
+                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
+                if barrier_comm_profiling:
+                    lhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    rhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    lhs @ rhs
+                    dist.all_reduce(torch.ones(1, dtype=torch.float, device="cuda"))
+                for _ in range(num_tests):
+                    fn()
+                prof.step()
+
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tupled = isinstance(kernel_names, tuple)
+    prof_lines = (
+        prof.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    for name in kernel_names:
+        assert (
+            sum([name in line for line in prof_lines]) == 1
+        ), f"Errors of the kernel {name} in the profiling table"
+
+    # Save chrome traces
+    if trace_path is not None:
+        prof.export_chrome_trace(trace_path)
+
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        kernel_times.append(float(time_str.replace(unit, "")) / scale)
+                        break
+                break
+    return tuple(kernel_times) if is_tupled else kernel_times[0]
+
+
+def hash_tensor(t: torch.Tensor):
+    return t.view(torch.int64).sum().item()
```
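The helpers above are copied verbatim from DeepEP's test utilities. A minimal sketch of how they might be exercised together is shown below; it is not part of the diff, and the tensor shape and the use of a CUDA device are illustrative assumptions.

```python
# Illustrative sketch only, not part of the diff. Assumes a CUDA device and an
# arbitrary 64 x 1024 tensor (the last dimension must be a multiple of 128).
import torch

from sglang.test.test_deepep_utils import (
    bench,
    calc_diff,
    per_token_cast_back,
    per_token_cast_to_fp8,
)

x = torch.randn(64, 1024, dtype=torch.bfloat16, device="cuda")

# Quantize per 128-element group to float8_e4m3fn, then dequantize.
x_fp8, x_scales = per_token_cast_to_fp8(x)
x_back = per_token_cast_back(x_fp8, x_scales)

# calc_diff returns a cosine-similarity-based error; values near 0 mean a close match.
print(f"round-trip error: {calc_diff(x, x_back):.3e}")

# bench times the callable with CUDA events and returns (avg, min, max) in seconds.
avg_t, min_t, max_t = bench(lambda: per_token_cast_to_fp8(x))
print(f"cast: avg {avg_t * 1e6:.1f} us, min {min_t * 1e6:.1f} us, max {max_t * 1e6:.1f} us")
```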
sglang/test/test_utils.py
CHANGED
```diff
@@ -66,9 +66,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
 )
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
+DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
@@ -77,7 +79,8 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
-
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"
 
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
@@ -770,6 +773,34 @@ def run_bench_offline_throughput(model, other_args):
     return output_throughput
 
 
+def run_bench_one_batch_server(
+    model,
+    base_url,
+    server_args,
+    bench_args,
+    other_server_args,
+    simulate_spec_acc_lens=None,
+):
+    from sglang.bench_one_batch_server import run_benchmark
+
+    if simulate_spec_acc_lens is not None:
+        env = {**os.environ, "SIMULATE_ACC_LEN": str(simulate_spec_acc_lens)}
+    else:
+        env = None
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+        env=env,
+    )
+    try:
+        run_benchmark(server_args=server_args, bench_args=bench_args)
+    finally:
+        kill_process_tree(process.pid)
+
+
 def lcs(X, Y):
     m = len(X)
     n = len(Y)
```
sglang/version.py
CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.4.6.post1"
+__version__ = "0.4.6.post3"
```
{sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post1
+Version: 0.4.6.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
@@ -238,20 +239,20 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
-Requires-Dist: torchao>=0.
+Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
+Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
 Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
-Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
@@ -260,7 +261,6 @@ Requires-Dist: torch; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
-Requires-Dist: partial_json_parser; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -277,6 +277,9 @@ Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -318,6 +321,11 @@ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
 Requires-Dist: sglang[litellm]; extra == "all-cpu"
+Provides-Extra: all-npu
+Requires-Dist: sglang[srt_npu]; extra == "all-npu"
+Requires-Dist: sglang[openai]; extra == "all-npu"
+Requires-Dist: sglang[anthropic]; extra == "all-npu"
+Requires-Dist: sglang[litellm]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -357,6 +365,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
@@ -382,7 +391,7 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral,
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
@@ -400,7 +409,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 
 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
```