sglang 0.4.5__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -4
- sglang/bench_one_batch.py +23 -2
- sglang/bench_serving.py +6 -4
- sglang/lang/backend/anthropic.py +0 -4
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/backend/vertexai.py +0 -1
- sglang/lang/compiler.py +1 -7
- sglang/lang/tracer.py +3 -7
- sglang/srt/_custom_ops.py +0 -2
- sglang/srt/configs/model_config.py +37 -5
- sglang/srt/constrained/base_grammar_backend.py +26 -5
- sglang/srt/constrained/llguidance_backend.py +1 -0
- sglang/srt/constrained/outlines_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
- sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- sglang/srt/constrained/xgrammar_backend.py +27 -4
- sglang/srt/custom_op.py +0 -62
- sglang/srt/disaggregation/base/__init__.py +8 -0
- sglang/srt/disaggregation/base/conn.py +113 -0
- sglang/srt/disaggregation/decode.py +80 -11
- sglang/srt/disaggregation/mini_lb.py +58 -123
- sglang/srt/disaggregation/mooncake/__init__.py +6 -0
- sglang/srt/disaggregation/mooncake/conn.py +585 -0
- sglang/srt/disaggregation/mooncake/transfer_engine.py +77 -0
- sglang/srt/disaggregation/prefill.py +82 -22
- sglang/srt/disaggregation/utils.py +46 -0
- sglang/srt/entrypoints/EngineBase.py +53 -0
- sglang/srt/entrypoints/engine.py +36 -8
- sglang/srt/entrypoints/http_server.py +37 -8
- sglang/srt/entrypoints/http_server_engine.py +142 -0
- sglang/srt/entrypoints/verl_engine.py +42 -13
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +6 -8
- sglang/srt/layers/attention/flashattention_backend.py +430 -257
- sglang/srt/layers/attention/flashinfer_backend.py +18 -9
- sglang/srt/layers/attention/torch_native_backend.py +6 -1
- sglang/srt/layers/attention/triton_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/dp_attention.py +2 -4
- sglang/srt/layers/elementwise.py +15 -2
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +18 -3
- sglang/srt/layers/moe/ep_moe/layer.py +15 -29
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +34 -34
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -34
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/router.py +7 -1
- sglang/srt/layers/moe/topk.py +63 -45
- sglang/srt/layers/parameter.py +0 -2
- sglang/srt/layers/quantization/__init__.py +13 -5
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +12 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -77
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- sglang/srt/layers/quantization/fp8.py +131 -136
- sglang/srt/layers/quantization/fp8_kernel.py +328 -46
- sglang/srt/layers/quantization/fp8_utils.py +206 -253
- sglang/srt/layers/quantization/kv_cache.py +43 -52
- sglang/srt/layers/quantization/modelopt_quant.py +271 -4
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/utils.py +5 -11
- sglang/srt/layers/quantization/w8a8_fp8.py +156 -4
- sglang/srt/layers/quantization/w8a8_int8.py +8 -7
- sglang/srt/layers/radix_attention.py +28 -1
- sglang/srt/layers/rotary_embedding.py +15 -3
- sglang/srt/layers/sampler.py +5 -10
- sglang/srt/lora/backend/base_backend.py +18 -2
- sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- sglang/srt/lora/backend/triton_backend.py +1 -1
- sglang/srt/lora/layers.py +1 -1
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +0 -1
- sglang/srt/managers/io_struct.py +255 -97
- sglang/srt/managers/mm_utils.py +7 -5
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +117 -79
- sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
- sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
- sglang/srt/managers/schedule_batch.py +64 -25
- sglang/srt/managers/scheduler.py +80 -82
- sglang/srt/managers/tokenizer_manager.py +18 -3
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -1
- sglang/srt/mem_cache/memory_pool.py +21 -3
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +9 -6
- sglang/srt/model_executor/forward_batch_info.py +234 -15
- sglang/srt/model_executor/model_runner.py +67 -35
- sglang/srt/model_loader/loader.py +31 -4
- sglang/srt/model_loader/weight_utils.py +4 -2
- sglang/srt/models/baichuan.py +2 -0
- sglang/srt/models/bert.py +398 -0
- sglang/srt/models/chatglm.py +1 -0
- sglang/srt/models/commandr.py +1 -0
- sglang/srt/models/dbrx.py +1 -0
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +74 -70
- sglang/srt/models/deepseek_v2.py +494 -366
- sglang/srt/models/exaone.py +1 -0
- sglang/srt/models/gemma.py +1 -0
- sglang/srt/models/gemma2.py +1 -0
- sglang/srt/models/gemma3_causal.py +1 -0
- sglang/srt/models/gpt2.py +1 -0
- sglang/srt/models/gpt_bigcode.py +1 -0
- sglang/srt/models/granite.py +1 -0
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/internlm2.py +1 -0
- sglang/srt/models/llama.py +6 -5
- sglang/srt/models/llama4.py +101 -34
- sglang/srt/models/minicpm.py +1 -0
- sglang/srt/models/minicpm3.py +30 -200
- sglang/srt/models/mixtral.py +1 -0
- sglang/srt/models/mixtral_quant.py +1 -0
- sglang/srt/models/mllama.py +51 -8
- sglang/srt/models/mllama4.py +102 -29
- sglang/srt/models/olmo.py +1 -0
- sglang/srt/models/olmo2.py +1 -0
- sglang/srt/models/olmoe.py +1 -0
- sglang/srt/models/phi3_small.py +1 -0
- sglang/srt/models/qwen.py +1 -0
- sglang/srt/models/qwen2.py +5 -1
- sglang/srt/models/qwen2_5_vl.py +35 -70
- sglang/srt/models/qwen2_moe.py +15 -13
- sglang/srt/models/qwen2_vl.py +27 -25
- sglang/srt/models/qwen3.py +335 -0
- sglang/srt/models/qwen3_moe.py +423 -0
- sglang/srt/models/stablelm.py +1 -0
- sglang/srt/models/xverse.py +1 -0
- sglang/srt/models/xverse_moe.py +1 -0
- sglang/srt/openai_api/adapter.py +4 -1
- sglang/srt/patch_torch.py +11 -0
- sglang/srt/reasoning_parser.py +0 -1
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/server_args.py +55 -19
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
- sglang/srt/speculative/eagle_utils.py +1 -11
- sglang/srt/speculative/eagle_worker.py +10 -9
- sglang/srt/utils.py +136 -10
- sglang/test/attention/test_flashattn_backend.py +259 -221
- sglang/test/attention/test_flashattn_mla_backend.py +285 -0
- sglang/test/attention/test_prefix_chunk_info.py +224 -0
- sglang/test/runners.py +5 -1
- sglang/test/test_block_fp8.py +224 -0
- sglang/test/test_custom_ops.py +1 -1
- sglang/test/test_utils.py +19 -8
- sglang/version.py +1 -1
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +15 -5
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +162 -147
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
- sglang/lang/__init__.py +0 -0
- sglang/srt/disaggregation/conn.py +0 -81
- sglang/srt/lora/backend/__init__.py +0 -25
- sglang/srt/server.py +0 -18
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8.py
CHANGED
@@ -7,10 +7,13 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
+    per_tensor_quant_mla_deep_gemm_masked_fp8,
+    per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
+from sglang.srt.layers.quantization.fp8_utils import input_to_float8
 from sglang.test.test_utils import CustomTestCase
 
 _is_cuda = torch.cuda.is_available() and torch.version.cuda
@@ -155,6 +158,117 @@ class TestStaticQuantFP8(CustomTestCase):
                 self._static_quant_fp8(*params)
 
 
+class TestPerTensorQuantMlaFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    LAST_D_EXT = [1024, 0]
+    LAST_D = [512]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_tensor_quant_mla_fp8(self, num_tokens, d, last_d_ext, last_d, dtype, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(
+            (num_tokens, d // last_d, last_d + last_d_ext),
+            dtype=dtype,
+        )
+        x_sub, _ = x.split([last_d, last_d_ext], dim=-1)
+
+        with torch.inference_mode():
+            ref_out, ref_s = input_to_float8(x_sub.transpose(0, 1))
+            out, out_s = per_tensor_quant_mla_fp8(x_sub.transpose(0, 1))
+
+        self.assertTrue(out.is_contiguous())
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.50)
+        )
+        self.assertTrue(
+            torch.allclose(out_s.to(torch.float32), ref_s.to(torch.float32))
+        )
+
+    def test_per_tensor_quant_mla_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.LAST_D_EXT,
+            self.LAST_D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                last_d_ext=params[2],
+                last_d=params[3],
+                dtype=params[4],
+                seed=params[5],
+            ):
+                self._per_tensor_quant_mla_fp8(*params)
+
+
+class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    B = [128]
+    NUM_TOKENS = [7, 83, 2048, 1024 * 16]
+    D = [512, 128]
+    GROUP_SIZE = [128]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_mla_deep_gemm_masked_fp8(
+        self, b, num_tokens, d, dtype, group_size, seed
+    ):
+        torch.manual_seed(seed)
+
+        x = torch.rand(b, num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
+            out, scale, _, _, _ = per_tensor_quant_mla_deep_gemm_masked_fp8(
+                x, group_size
+            )
+            out = out[:, :num_tokens, :]
+            scale = scale[:, :num_tokens, :]
+
+        self.assertTrue(
+            torch.allclose(
+                out.to(torch.float32), ref_out.to(torch.float32), rtol=0.20, atol=1e-2
+            )
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_mla_deep_gemm_masked_fp8(self):
+        for params in itertools.product(
+            self.B,
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                b=params[0],
+                num_tokens=params[1],
+                d=params[2],
+                dtype=params[3],
+                group_size=params[4],
+                seed=params[5],
+            ):
+                self._per_token_group_quant_mla_deep_gemm_masked_fp8(*params)
+
+
 # For test
 def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
     """This function performs matrix multiplication with block-wise quantization using native torch.
@@ -428,5 +542,115 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
                 self._w8a8_block_fp8_fused_moe(*params)
 
 
+# For test
+def torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_shape, out_dtype):
+    """This function performs bmm with block-wise quantization using native torch."""
+
+    B, N, _ = w.shape
+    _, M, _ = a.shape
+    out = torch.empty((B, M, N), dtype=out_dtype, device=a.device)
+
+    for i in range(B):
+        out[i] = native_w8a8_block_fp8_matmul(
+            a[i], w[i], a_s[i], w_s[i], block_shape, output_dtype=out_dtype
+        )
+
+    return out
+
+
+class TestW8A8BlockFP8BatchedDeepGemm(CustomTestCase):
+    DTYPES = [torch.bfloat16]
+    M = [1, 33, 64, 222, 8192]
+    N = [128, 512]
+    K = [128, 512]
+    BATCH = [128]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        try:
+            import deep_gemm
+        except ImportError:
+            raise unittest.SkipTest("DeepGEMM is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_batched_deep_gemm(self, M, N, K, B, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a_fp32 = torch.randn((B, M, K), dtype=torch.float32) / 10
+        a = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w_fp32 = (torch.rand((B, N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w = w_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w = (N + block_n - 1) // block_n
+        k_tiles_w = (K + block_k - 1) // block_k
+
+        w_s = (
+            torch.rand((B, n_tiles_w, k_tiles_w), dtype=torch.float32)
+            * factor_for_scale
+        )
+        a_s = torch.rand((B, M, k_tiles_w), dtype=torch.float32) * factor_for_scale
+
+        ae = a.new_empty(B, (M + 255) // 256 * 256, K)
+        ae_s = a_s.new_empty(B, (M + 255) // 256 * 256, k_tiles_w)
+        oe = torch.empty((B, (M + 255) // 256 * 256, N), dtype=dtype)
+        ae[:, :M, :] = a
+        ae_s[:, :M, :] = a_s
+
+        masked_m = torch.full((B,), M, dtype=torch.int)
+        expected_m = M
+        lhs = (
+            ae,
+            ae_s,
+        )
+        rhs = (
+            w,
+            w_s,
+        )
+
+        from deep_gemm import m_grouped_gemm_fp8_fp8_bf16_nt_masked
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_size, dtype)
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs, rhs, oe, masked_m, expected_m)
+            out = oe[:, :M, :]
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.0001
+        )
+
+    def test_w8a8_block_fp8_batched_deep_gemm(self):
+
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.BATCH,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                B=params[3],
+                block_size=params[4],
+                dtype=params[5],
+                seed=params[6],
+            ):
+                self._w8a8_block_fp8_batched_deep_gemm(*params)
+
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)
sglang/test/test_custom_ops.py
CHANGED
sglang/test/test_utils.py
CHANGED
@@ -25,7 +25,12 @@ from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    get_bool_env_var,
+    is_port_available,
+    kill_process_tree,
+    retry,
+)
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
@@ -37,11 +42,6 @@ DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
 DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
     "nvidia/Llama-3.1-8B-Instruct-FP8"
 )
-# TODO(yundai424): right now specifying to an older revision since the latest one
-# carries kv cache quantization which doesn't work yet
-DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION = (
-    "13858565416dbdc0b4e7a4a677fadfbd5b9e5bb9"
-)
 
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
@@ -103,6 +103,17 @@ def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None)
     return pred
 
 
+def find_available_port(base_port: int):
+    port = base_port + random.randint(100, 1000)
+    while True:
+        if is_port_available(port):
+            return port
+        if port < 60000:
+            port += 42
+        else:
+            port -= 43
+
+
 def call_generate_vllm(prompt, temperature, max_tokens, stop=None, n=1, url=None):
     assert url is not None
 
@@ -674,8 +685,6 @@ def run_bench_one_batch(model, other_args):
         "python3",
         "-m",
         "sglang.bench_one_batch",
-        "--model-path",
-        model,
         "--batch-size",
         "1",
         "--input",
@@ -684,6 +693,8 @@ def run_bench_one_batch(model, other_args):
         "8",
         *[str(x) for x in other_args],
     ]
+    if model is not None:
+        command += ["--model-path", model]
     process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
     try:
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5"
+__version__ = "0.4.5.post2"
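
A quick check that an installed wheel carries the bumped version string, as a sketch using only the module touched by the hunk above:

# Sketch: confirm the version bump from sglang/version.py.
from sglang.version import __version__

assert __version__ == "0.4.5.post2", __version__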
{sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5
+Version: 0.4.5.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -239,20 +239,30 @@ Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.51.
+Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.
+Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
+Requires-Dist: torchvision==0.20.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
+Provides-Extra: blackwell
+Requires-Dist: sglang[runtime_common]; extra == "blackwell"
+Requires-Dist: sgl-kernel; extra == "blackwell"
+Requires-Dist: torch; extra == "blackwell"
+Requires-Dist: torchvision; extra == "blackwell"
+Requires-Dist: cuda-python; extra == "blackwell"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
+Requires-Dist: partial_json_parser; extra == "blackwell"
+Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -371,7 +381,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs,
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -391,7 +401,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 
 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
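
The metadata hunks pin transformers to 4.51.1, bump sgl-kernel to 0.0.9.post2, add torchvision to the srt extra, and introduce a new blackwell extra. A hedged sketch for listing the requirements attached to the new extra from an installed wheel, using only the standard-library importlib.metadata API:

# Sketch: print the Requires-Dist entries gated on the new "blackwell" extra.
# Assumes sglang 0.4.5.post2 is installed in the current environment.
from importlib.metadata import requires

for req in requires("sglang") or []:
    if 'extra == "blackwell"' in req:
        print(req)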