sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/conversation.py +6 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +196 -51
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +18 -13
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +128 -43
- sglang/srt/disaggregation/utils.py +127 -123
- sglang/srt/entrypoints/engine.py +15 -1
- sglang/srt/entrypoints/http_server.py +13 -2
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +15 -2
- sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
- sglang/srt/layers/attention/flashattention_backend.py +53 -64
- sglang/srt/layers/attention/flashinfer_backend.py +1 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/triton_backend.py +119 -119
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +23 -5
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
- sglang/srt/layers/moe/ep_moe/layer.py +42 -32
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
- sglang/srt/layers/moe/topk.py +16 -8
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/lora/lora_manager.py +79 -34
- sglang/srt/lora/mem_pool.py +4 -5
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/io_struct.py +28 -4
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +39 -6
- sglang/srt/managers/scheduler.py +73 -17
- sglang/srt/managers/tokenizer_manager.py +29 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/model_executor/cuda_graph_runner.py +122 -55
- sglang/srt/model_executor/forward_batch_info.py +14 -5
- sglang/srt/model_executor/model_runner.py +6 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_v2.py +113 -155
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +162 -4
- sglang/srt/openai_api/protocol.py +37 -1
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +318 -233
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
- sglang/srt/speculative/eagle_utils.py +389 -109
- sglang/srt/speculative/eagle_worker.py +134 -43
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +58 -0
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8_deep_gemm_blackwell.py
ADDED
@@ -0,0 +1,252 @@
+import itertools
+import os
+import unittest
+from typing import List, Tuple
+
+import torch
+from deep_gemm import fp8_gemm_nt
+
+from sglang.test.test_utils import CustomTestCase
+
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+
+
+# Modified from DeepGEMM Blackwell
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def align(x: int, y: int) -> int:
+    return ceil_div(x, y) * y
+
+
+def per_token_group_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = x_amax / 448.0
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = x_amax / 448.0
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+def ceil_to_ue8m0(x: torch.Tensor):
+    assert x.view(-1).amax().item() > 0
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+def per_token_group_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """This function converts block-wise quantization to unquantized.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The output is an unquantized tensor with dtype.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+
+    for j in range(n_tiles):
+        for i in range(k_tiles):
+            x_q_block_tile = x_q_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile = x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
+
+    return x_dq_block
+
+
+class TestDeepGemmBlackwell(CustomTestCase):
+
+    if not _is_cuda:
+        OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+        M = [1, 7, 83, 512, 2048]
+        NKs = [
+            (N, K)
+            for N in [128, 512, 1024, 4096, 7748, 13824]
+            for K in [256, 4096, 5120, 3884, 13824]
+        ]
+        # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+    else:
+        # use practical shape in DeepSeek V3 for test
+        OUT_DTYPES = [torch.bfloat16]
+        M = [64, 128, 512, 1024, 4096]
+        NKs = [
+            (2112, 7168),
+            (1536, 7168),
+            # (3072, 1536),
+            # (24576, 7168),
+            # (4096, 512),
+            # (7168, 2048),
+            # (4608, 7168),
+            # (512, 7168),
+            # (7168, 2304),
+            # (7168, 512),
+        ]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _test_deep_gemm_blackwell(self, M, NK, block_size, out_dtype, seed):
+        N, K = NK
+        torch.manual_seed(seed)
+
+        A = torch.empty((M, K), dtype=torch.bfloat16).normal_(0, 0.2)
+        B = torch.empty((N, K), dtype=torch.bfloat16).normal_(0, 0.2)
+
+        A_q, A_s = per_token_group_quant_fp8(A)
+        B_q, B_s = per_block_quant_fp8(B)
+
+        A_dq = block_quant_dequant(A_q, A_s, [1, block_size[1]], out_dtype)
+        B_dq = block_quant_dequant(B_q, B_s, block_size, out_dtype)
+
+        A_qu = per_token_group_quant_mxfp8(A_dq)
+        B_qu = per_block_quant_mxfp8(B_dq)
+        out = None
+
+        with torch.inference_mode():
+            ref_out = native_w8a8_block_fp8_matmul(
+                A_q, B_q, A_s, B_s, block_size, out_dtype
+            )
+            out = torch.empty_like(ref_out)
+            fp8_gemm_nt(A_qu, B_qu, out)
+
+        torch.testing.assert_close(out, ref_out, atol=1e-1, rtol=1e-2)
+
+    def test_deep_gemm_blackwell(self):
+        for params in itertools.product(
+            self.M,
+            self.NKs,
+            self.BLOCK_SIZE,
+            self.OUT_DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                NKs=params[1],
+                block_size=params[2],
+                out_dtype=params[3],
+                seed=params[4],
+            ):
+                self._test_deep_gemm_blackwell(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
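A detail worth noting in this new test: before calling `fp8_gemm_nt`, both operands are re-quantized with scale factors rounded up to the nearest power of two (UE8M0-style encoding), which appears to be the scale format the Blackwell DeepGEMM path consumes. A minimal standalone sketch of that rounding, mirroring the `ceil_to_ue8m0` helper shown above (the sample values are illustrative only):

import torch

def ceil_to_ue8m0(x: torch.Tensor) -> torch.Tensor:
    # Round each positive scale up to the nearest power of two.
    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))

scales = torch.tensor([0.3, 1.0, 5.7])
print(ceil_to_ue8m0(scales))  # tensor([0.5000, 1.0000, 8.0000])

Running the whole file should follow the usual unittest entry point, e.g. `python -m unittest sglang.test.test_block_fp8_deep_gemm_blackwell`, on a machine with CUDA and the `deep_gemm` package available.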
sglang/test/test_block_fp8_ep.py
CHANGED
sglang/test/test_utils.py
CHANGED
@@ -41,6 +41,8 @@ DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
 
 # MLA test models
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST = "cross-encoder/ms-marco-MiniLM-L6-v2"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
@@ -85,7 +87,7 @@ DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
 
-DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH =
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 
 
 def is_in_ci():
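For context, the two new constants are meant to be imported by downstream test files the same way the existing DEFAULT_* names are; a trivial, hypothetical sketch of such a consumer (the surrounding test harness is not shown here):

# Hypothetical downstream use of the new constants (import path per this diff).
from sglang.test.test_utils import (
    DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST,
    DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
)

print(DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST)      # Alibaba-NLP/gte-Qwen2-1.5B-instruct
print(DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST)  # cross-encoder/ms-marco-MiniLM-L6-v2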
sglang/utils.py
CHANGED
@@ -512,3 +512,12 @@ async def async_stream_and_merge(llm, prompt, sampling_params):
         cleaned_chunk = trim_overlap(final_text, chunk_text)
         final_text += cleaned_chunk
         yield cleaned_chunk  # yield the non-overlapping portion
+
+
+def resolve_obj_by_qualname(qualname: str) -> Any:
+    """
+    Resolve an object by its fully qualified name.
+    """
+    module_name, obj_name = qualname.rsplit(".", 1)
+    module = importlib.import_module(module_name)
+    return getattr(module, obj_name)
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.7"
+__version__ = "0.4.7.post1"
{sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.7
+Version: 0.4.7.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -248,7 +248,7 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.1.
+Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
@@ -371,7 +371,7 @@ Dynamic: license-file
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/
+| [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -403,7 +403,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching,
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -422,7 +422,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale,
+SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
|