sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_one_batch.py +8 -6
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +19 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +211 -72
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +30 -29
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +144 -55
- sglang/srt/disaggregation/utils.py +155 -123
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +37 -29
- sglang/srt/entrypoints/http_server.py +153 -72
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +40 -3
- sglang/srt/layers/attention/aiter_backend.py +20 -4
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
- sglang/srt/layers/attention/flashattention_backend.py +71 -72
- sglang/srt/layers/attention/flashinfer_backend.py +10 -8
- sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -12
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +138 -130
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +28 -10
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +2 -14
- sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
- sglang/srt/layers/moe/ep_moe/layer.py +249 -33
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +107 -12
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +249 -105
- sglang/srt/lora/mem_pool.py +53 -50
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -14
- sglang/srt/managers/io_struct.py +31 -10
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +79 -37
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +220 -79
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +40 -10
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -15
- sglang/srt/mem_cache/hiradix_cache.py +38 -25
- sglang/srt/mem_cache/memory_pool.py +213 -505
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +56 -28
- sglang/srt/model_executor/cuda_graph_runner.py +198 -100
- sglang/srt/model_executor/forward_batch_info.py +32 -10
- sglang/srt/model_executor/model_runner.py +28 -12
- sglang/srt/model_loader/loader.py +16 -2
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +213 -173
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +351 -238
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
- sglang/srt/speculative/eagle_utils.py +468 -116
- sglang/srt/speculative/eagle_worker.py +258 -84
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +235 -11
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +2 -0
- sglang/test/test_utils.py +4 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -1990
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8_deep_gemm_blackwell.py
ADDED
@@ -0,0 +1,252 @@
+import itertools
+import os
+import unittest
+from typing import List, Tuple
+
+import torch
+from deep_gemm import fp8_gemm_nt
+
+from sglang.test.test_utils import CustomTestCase
+
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+
+
+# Modify form DeepGEMM Blackwell
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def align(x: int, y: int) -> int:
+    return ceil_div(x, y) * y
+
+
+def per_token_group_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = x_amax / 448.0
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = x_amax / 448.0
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+def ceil_to_ue8m0(x: torch.Tensor):
+    assert x.view(-1).amax().item() > 0
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+def per_token_group_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """This function converts block-wise quantization to unquantized.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The output is an unquantized tensor with dtype.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+
+    for j in range(n_tiles):
+        for i in range(k_tiles):
+            x_q_block_tile = x_q_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile = x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
+
+    return x_dq_block
+
+
+class TestDeepGemmBlackwell(CustomTestCase):
+
+    if not _is_cuda:
+        OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+        M = [1, 7, 83, 512, 2048]
+        NKs = [
+            (N, K)
+            for N in [128, 512, 1024, 4096, 7748, 13824]
+            for K in [256, 4096, 5120, 3884, 13824]
+        ]
+        # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+    else:
+        # use practical shape in DeepSeek V3 for test
+        OUT_DTYPES = [torch.bfloat16]
+        M = [64, 128, 512, 1024, 4096]
+        NKs = [
+            (2112, 7168),
+            (1536, 7168),
+            # (3072, 1536),
+            # (24576, 7168),
+            # (4096, 512),
+            # (7168, 2048),
+            # (4608, 7168),
+            # (512, 7168),
+            # (7168, 2304),
+            # (7168, 512),
+        ]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _test_deep_gemm_blackwell(self, M, NK, block_size, out_dtype, seed):
+        N, K = NK
+        torch.manual_seed(seed)
+
+        A = torch.empty((M, K), dtype=torch.bfloat16).normal_(0, 0.2)
+        B = torch.empty((N, K), dtype=torch.bfloat16).normal_(0, 0.2)
+
+        A_q, A_s = per_token_group_quant_fp8(A)
+        B_q, B_s = per_block_quant_fp8(B)
+
+        A_dq = block_quant_dequant(A_q, A_s, [1, block_size[1]], out_dtype)
+        B_dq = block_quant_dequant(B_q, B_s, block_size, out_dtype)
+
+        A_qu = per_token_group_quant_mxfp8(A_dq)
+        B_qu = per_block_quant_mxfp8(B_dq)
+        out = None
+
+        with torch.inference_mode():
+            ref_out = native_w8a8_block_fp8_matmul(
+                A_q, B_q, A_s, B_s, block_size, out_dtype
+            )
+            out = torch.empty_like(ref_out)
+            fp8_gemm_nt(A_qu, B_qu, out)
+
+        torch.testing.assert_close(out, ref_out, atol=1e-1, rtol=1e-2)
+
+    def test_deep_gemm_blackwell(self):
+        for params in itertools.product(
+            self.M,
+            self.NKs,
+            self.BLOCK_SIZE,
+            self.OUT_DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                NKs=params[1],
+                block_size=params[2],
+                out_dtype=params[3],
+                seed=params[4],
+            ):
+                self._test_deep_gemm_blackwell(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
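The quantization helpers in this new test scale each 1×128 token group (or 128×128 weight block) by its absolute maximum divided by 448, the largest value representable in `torch.float8_e4m3fn`; the MXFP8 variants additionally round that scale up to a power of two via `ceil_to_ue8m0`. Below is a minimal round-trip sketch of those helpers, not part of the package itself; the toy shapes are arbitrary, and it assumes a CUDA device plus an installed `deep_gemm` package (the test module imports `fp8_gemm_nt` at import time).

```python
import torch

from sglang.test.test_block_fp8_deep_gemm_blackwell import (
    block_quant_dequant,
    per_token_group_quant_fp8,
)

x = torch.randn(4, 256, dtype=torch.bfloat16, device="cuda")

# Quantize each row in groups of 128 columns: FP8 (E4M3) values plus one
# float32 scale per (row, group), where scale = group_amax / 448.
x_q, x_s = per_token_group_quant_fp8(x)

# Dequantize with 1x128 blocks and compare with the original tensor;
# the error is bounded by the FP8 step size within each group.
x_dq = block_quant_dequant(x_q, x_s, [1, 128], torch.bfloat16)
print((x - x_dq).abs().max())
```

This mirrors how `_test_deep_gemm_blackwell` builds its reference: it dequantizes the FP8 inputs, requantizes them with UE8M0 scales, and checks `fp8_gemm_nt` against the pure-PyTorch `native_w8a8_block_fp8_matmul`.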
sglang/test/test_block_fp8_ep.py
CHANGED
@@ -84,6 +84,7 @@ def ep_moe(
         top_k,
         hidden_states.shape[1],
         BLOCK_SIZE=512,
+        use_per_token_if_dynamic=True,
     )

     seg_indptr_cur_rank = seg_indptr[start_expert_id : end_expert_id + 2]
@@ -181,6 +182,7 @@ def ep_moe(
         end_expert_id,
         top_k,
         hidden_states.size(1),
+        0,
         BLOCK_SIZE=512,
     )
     return output
sglang/test/test_utils.py
CHANGED
@@ -37,10 +37,13 @@ from sglang.utils import get_exception_traceback
 # General test models
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"

 # MLA test models
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST = "cross-encoder/ms-marco-MiniLM-L6-v2"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
@@ -85,7 +88,7 @@ DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"

-DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH =
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600


 def is_in_ci():
sglang/utils.py
CHANGED
@@ -512,3 +512,12 @@ async def async_stream_and_merge(llm, prompt, sampling_params):
         cleaned_chunk = trim_overlap(final_text, chunk_text)
         final_text += cleaned_chunk
         yield cleaned_chunk  # yield the non-overlapping portion
+
+
+def resolve_obj_by_qualname(qualname: str) -> Any:
+    """
+    Resolve an object by its fully qualified name.
+    """
+    module_name, obj_name = qualname.rsplit(".", 1)
+    module = importlib.import_module(module_name)
+    return getattr(module, obj_name)
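The new `resolve_obj_by_qualname` helper splits a dotted path into a module name and an attribute name, imports the module, and returns the attribute. A small usage sketch follows; the target `os.path.join` is just an arbitrary stand-in, not something the package itself resolves.

```python
from sglang.utils import resolve_obj_by_qualname

# "os.path.join" -> import_module("os.path"), then getattr(module, "join")
join = resolve_obj_by_qualname("os.path.join")
print(join("a", "b"))  # "a/b" on POSIX
```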
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.7"
+__version__ = "0.4.8"
{sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.7
+Version: 0.4.8
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: outlines==0.1.11; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
@@ -248,14 +249,13 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.
-Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
+Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: einops; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -263,27 +263,21 @@ Requires-Dist: torch==2.7.1; extra == "blackwell"
 Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
 Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
-Requires-Dist: outlines==0.1.11; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-hpu"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: einops; extra == "srt-cpu"
 Provides-Extra: srt-npu
 Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
-Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -292,7 +286,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver>=0.0.
+Requires-Dist: torch_memory_saver>=0.0.8; extra == "torch-memory-saver"
 Provides-Extra: decord
 Requires-Dist: decord; extra == "decord"
 Provides-Extra: test
@@ -371,7 +365,7 @@ Dynamic: license-file

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/
+| [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -403,7 +397,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching,
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -422,7 +416,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)

 ## Adoption and Sponsorship
-SGLang has been deployed at large scale,
+SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>