sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +3 -1
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +667 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +63 -11
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/parallel_state.py +10 -3
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +71 -12
- sglang/srt/function_call_parser.py +133 -54
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +295 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +32 -21
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +31 -18
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +184 -126
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +24 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -9
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +66 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/layers.py +68 -0
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +47 -23
- sglang/srt/lora/mem_pool.py +110 -51
- sglang/srt/lora/utils.py +12 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +39 -3
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +133 -30
- sglang/srt/managers/scheduler.py +273 -20
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -23
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +27 -13
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +70 -6
- sglang/srt/model_loader/loader.py +160 -2
- sglang/srt/model_loader/weight_utils.py +45 -0
- sglang/srt/models/deepseek_janus_pro.py +29 -86
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +208 -77
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +684 -0
- sglang/srt/models/gemma3_mm.py +462 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +124 -28
- sglang/srt/openai_api/protocol.py +23 -2
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +99 -9
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +139 -53
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +182 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +2 -0
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +55 -4
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +167 -123
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8.py
CHANGED
@@ -11,6 +11,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
|
|
11
11
|
static_quant_fp8,
|
12
12
|
w8a8_block_fp8_matmul,
|
13
13
|
)
|
14
|
+
from sglang.test.test_utils import CustomTestCase
|
14
15
|
|
15
16
|
_is_cuda = torch.cuda.is_available() and torch.version.cuda
|
16
17
|
|
@@ -44,7 +45,7 @@ def native_per_token_group_quant_fp8(
|
|
44
45
|
return x_q, x_s
|
45
46
|
|
46
47
|
|
47
|
-
class TestPerTokenGroupQuantFP8(unittest.TestCase):
|
48
|
+
class TestPerTokenGroupQuantFP8(CustomTestCase):
|
48
49
|
DTYPES = [torch.half, torch.bfloat16, torch.float32]
|
49
50
|
NUM_TOKENS = [7, 83, 2048]
|
50
51
|
D = [512, 4096, 5120, 13824]
|
@@ -111,7 +112,7 @@ def native_static_quant_fp8(x, x_s, dtype=torch.float8_e4m3fn):
|
|
111
112
|
return x_q, x_s
|
112
113
|
|
113
114
|
|
114
|
-
class TestStaticQuantFP8(unittest.TestCase):
|
115
|
+
class TestStaticQuantFP8(CustomTestCase):
|
115
116
|
DTYPES = [torch.half, torch.bfloat16, torch.float32]
|
116
117
|
NUM_TOKENS = [7, 83, 2048]
|
117
118
|
D = [512, 4096, 5120, 13824]
|
@@ -210,7 +211,7 @@ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.fl
|
|
210
211
|
return C
|
211
212
|
|
212
213
|
|
213
|
-
class TestW8A8BlockFP8Matmul(unittest.TestCase):
|
214
|
+
class TestW8A8BlockFP8Matmul(CustomTestCase):
|
214
215
|
|
215
216
|
if not _is_cuda:
|
216
217
|
OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
|
@@ -331,7 +332,7 @@ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
|
|
331
332
|
).sum(dim=1)
|
332
333
|
|
333
334
|
|
334
|
-
class TestW8A8BlockFP8FusedMoE(unittest.TestCase):
|
335
|
+
class TestW8A8BlockFP8FusedMoE(CustomTestCase):
|
335
336
|
DTYPES = [torch.float32, torch.half, torch.bfloat16]
|
336
337
|
M = [1, 33, 64, 222, 1024 * 128]
|
337
338
|
N = [128, 1024, 2048]
|
sglang/test/test_block_fp8_ep.py
CHANGED
@@ -13,6 +13,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
|
|
13
13
|
silu_and_mul_triton_kernel,
|
14
14
|
)
|
15
15
|
from sglang.srt.layers.moe.topk import select_experts
|
16
|
+
from sglang.test.test_utils import CustomTestCase
|
16
17
|
|
17
18
|
|
18
19
|
# For test
|
@@ -232,7 +233,7 @@ def block_dequant(
|
|
232
233
|
return x_dq_block
|
233
234
|
|
234
235
|
|
235
|
-
class TestW8A8BlockFP8EPMoE(unittest.TestCase):
|
236
|
+
class TestW8A8BlockFP8EPMoE(CustomTestCase):
|
236
237
|
DTYPES = [torch.half, torch.bfloat16]
|
237
238
|
M = [1, 222, 1024, 2048]
|
238
239
|
N = [128, 1024, 2048]
|
sglang/test/test_dynamic_grad_mode.py
@@ -0,0 +1,58 @@
|
|
1
|
+
import unittest
|
2
|
+
|
3
|
+
import torch
|
4
|
+
|
5
|
+
from sglang.srt.utils import DynamicGradMode
|
6
|
+
from sglang.test.test_utils import CustomTestCase
|
7
|
+
|
8
|
+
|
9
|
+
class TestDynamicGradMode(CustomTestCase):
|
10
|
+
def test_inference(self):
|
11
|
+
# Test inference_mode
|
12
|
+
DynamicGradMode.set_inference_mode(True)
|
13
|
+
|
14
|
+
@DynamicGradMode()
|
15
|
+
def create_tensor_x():
|
16
|
+
return torch.empty(0)
|
17
|
+
|
18
|
+
X = create_tensor_x()
|
19
|
+
self.assertTrue(not X.requires_grad and X.is_inference())
|
20
|
+
|
21
|
+
def test_no_grad(self):
|
22
|
+
# Test no_grad
|
23
|
+
DynamicGradMode.set_inference_mode(False)
|
24
|
+
|
25
|
+
@DynamicGradMode()
|
26
|
+
def create_tensor_y():
|
27
|
+
return torch.empty(0)
|
28
|
+
|
29
|
+
Y = create_tensor_y()
|
30
|
+
self.assertTrue(not Y.requires_grad and not Y.is_inference())
|
31
|
+
|
32
|
+
def test_nested_inference(self):
|
33
|
+
# Test no_grad nested inference_mode, inference_mode should has higher priority
|
34
|
+
DynamicGradMode.set_inference_mode(False)
|
35
|
+
|
36
|
+
@DynamicGradMode()
|
37
|
+
def create_tensor_z():
|
38
|
+
with torch.inference_mode():
|
39
|
+
return torch.empty(0)
|
40
|
+
|
41
|
+
Z = create_tensor_z()
|
42
|
+
self.assertTrue(not Z.requires_grad and Z.is_inference())
|
43
|
+
|
44
|
+
def test_nested_no_grad(self):
|
45
|
+
# Test inference_mode nested no_grad, inference_mode should has higher priority
|
46
|
+
DynamicGradMode.set_inference_mode(True)
|
47
|
+
|
48
|
+
@DynamicGradMode()
|
49
|
+
def create_tensor_w():
|
50
|
+
with torch.no_grad():
|
51
|
+
return torch.empty(0)
|
52
|
+
|
53
|
+
W = create_tensor_w()
|
54
|
+
self.assertTrue(not W.requires_grad and W.is_inference())
|
55
|
+
|
56
|
+
|
57
|
+
if __name__ == "__main__":
|
58
|
+
unittest.main(verbosity=2)
|
sglang/test/test_layernorm.py
CHANGED
@@ -4,9 +4,10 @@ import unittest
|
|
4
4
|
import torch
|
5
5
|
|
6
6
|
from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
|
7
|
+
from sglang.test.test_utils import CustomTestCase
|
7
8
|
|
8
9
|
|
9
|
-
class TestRMSNorm(unittest.TestCase):
|
10
|
+
class TestRMSNorm(CustomTestCase):
|
10
11
|
DTYPES = [torch.half, torch.bfloat16]
|
11
12
|
NUM_TOKENS = [7, 83, 4096]
|
12
13
|
HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
|
@@ -56,7 +57,7 @@ class TestRMSNorm(unittest.TestCase):
|
|
56
57
|
self._run_rms_norm_test(*params)
|
57
58
|
|
58
59
|
|
59
|
-
class TestGemmaRMSNorm(unittest.TestCase):
|
60
|
+
class TestGemmaRMSNorm(CustomTestCase):
|
60
61
|
DTYPES = [torch.half, torch.bfloat16]
|
61
62
|
NUM_TOKENS = [7, 83, 4096]
|
62
63
|
HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
|
sglang/test/test_utils.py
CHANGED
@@ -1,15 +1,17 @@
|
|
1
1
|
"""Common utilities for testing and benchmarking"""
|
2
2
|
|
3
3
|
import argparse
|
4
|
-
import asyncio
|
5
4
|
import copy
|
5
|
+
import logging
|
6
6
|
import os
|
7
7
|
import random
|
8
8
|
import subprocess
|
9
9
|
import threading
|
10
10
|
import time
|
11
|
+
import traceback
|
11
12
|
import unittest
|
12
13
|
from concurrent.futures import ThreadPoolExecutor
|
14
|
+
from dataclasses import dataclass
|
13
15
|
from functools import partial
|
14
16
|
from types import SimpleNamespace
|
15
17
|
from typing import Callable, List, Optional, Tuple
|
@@ -32,6 +34,15 @@ DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct
|
|
32
34
|
DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
|
33
35
|
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
|
34
36
|
)
|
37
|
+
DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
|
38
|
+
"nvidia/Llama-3.1-8B-Instruct-FP8"
|
39
|
+
)
|
40
|
+
# TODO(yundai424): right now specifying to an older revision since the latest one
|
41
|
+
# carries kv cache quantization which doesn't work yet
|
42
|
+
DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION = (
|
43
|
+
"13858565416dbdc0b4e7a4a677fadfbd5b9e5bb9"
|
44
|
+
)
|
45
|
+
|
35
46
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
|
36
47
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
37
48
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
@@ -52,7 +63,6 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
|
|
52
63
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
|
53
64
|
DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
|
54
65
|
|
55
|
-
|
56
66
|
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
57
67
|
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
|
58
68
|
|
@@ -423,6 +433,11 @@ def popen_launch_server(
|
|
423
433
|
return process
|
424
434
|
except requests.RequestException:
|
425
435
|
pass
|
436
|
+
|
437
|
+
return_code = process.poll()
|
438
|
+
if return_code is not None:
|
439
|
+
raise Exception(f"Server unexpectedly exits ({return_code=}).")
|
440
|
+
|
426
441
|
time.sleep(10)
|
427
442
|
|
428
443
|
kill_process_tree(process.pid)
|
@@ -453,7 +468,13 @@ def run_with_timeout(
|
|
453
468
|
return ret_value[0]
|
454
469
|
|
455
470
|
|
456
|
-
|
471
|
+
@dataclass
|
472
|
+
class TestFile:
|
473
|
+
name: str
|
474
|
+
estimated_time: float = 60
|
475
|
+
|
476
|
+
|
477
|
+
def run_unittest_files(files: List[TestFile], timeout_per_file: float):
|
457
478
|
tic = time.time()
|
458
479
|
success = True
|
459
480
|
|
@@ -870,7 +891,6 @@ def run_mulit_request_test(
|
|
870
891
|
enable_overlap=False,
|
871
892
|
chunked_prefill_size=32,
|
872
893
|
):
|
873
|
-
|
874
894
|
def workload_func(base_url, model):
|
875
895
|
def run_one(_):
|
876
896
|
prompt = """
|
@@ -905,6 +925,10 @@ def run_mulit_request_test(
|
|
905
925
|
|
906
926
|
|
907
927
|
def write_github_step_summary(content):
|
928
|
+
if not os.environ.get("GITHUB_STEP_SUMMARY"):
|
929
|
+
logging.warning("GITHUB_STEP_SUMMARY environment variable not set")
|
930
|
+
return
|
931
|
+
|
908
932
|
with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
|
909
933
|
f.write(content)
|
910
934
|
|
@@ -982,3 +1006,30 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
|
|
982
1006
|
rank += 1
|
983
1007
|
else:
|
984
1008
|
raise
|
1009
|
+
|
1010
|
+
|
1011
|
+
class CustomTestCase(unittest.TestCase):
|
1012
|
+
def _callTestMethod(self, method):
|
1013
|
+
_retry_execution(
|
1014
|
+
lambda: super(CustomTestCase, self)._callTestMethod(method),
|
1015
|
+
max_retry=_get_max_retry(),
|
1016
|
+
)
|
1017
|
+
|
1018
|
+
|
1019
|
+
def _get_max_retry():
|
1020
|
+
return int(os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0"))
|
1021
|
+
|
1022
|
+
|
1023
|
+
def _retry_execution(fn, max_retry: int):
|
1024
|
+
if max_retry == 0:
|
1025
|
+
fn()
|
1026
|
+
return
|
1027
|
+
|
1028
|
+
try:
|
1029
|
+
fn()
|
1030
|
+
except Exception as e:
|
1031
|
+
print(
|
1032
|
+
f"retry_execution failed once and will retry. This may be an error or a flaky test. Error: {e}"
|
1033
|
+
)
|
1034
|
+
traceback.print_exc()
|
1035
|
+
_retry_execution(fn, max_retry=max_retry - 1)
|
sglang/utils.py
CHANGED
@@ -22,6 +22,7 @@ from typing import Any, Callable, List, Optional, Tuple, Type, Union
|
|
22
22
|
import numpy as np
|
23
23
|
import requests
|
24
24
|
from IPython.display import HTML, display
|
25
|
+
from pydantic import BaseModel
|
25
26
|
from tqdm import tqdm
|
26
27
|
|
27
28
|
from sglang.srt.utils import kill_process_tree
|
@@ -29,6 +30,36 @@ from sglang.srt.utils import kill_process_tree
|
|
29
30
|
logger = logging.getLogger(__name__)
|
30
31
|
|
31
32
|
|
33
|
+
def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str:
|
34
|
+
"""Convert a JSON schema to a string.
|
35
|
+
Parameters
|
36
|
+
----------
|
37
|
+
json_schema
|
38
|
+
The JSON schema.
|
39
|
+
Returns
|
40
|
+
-------
|
41
|
+
str
|
42
|
+
The JSON schema converted to a string.
|
43
|
+
Raises
|
44
|
+
------
|
45
|
+
ValueError
|
46
|
+
If the schema is not a dictionary, a string or a Pydantic class.
|
47
|
+
"""
|
48
|
+
if isinstance(json_schema, dict):
|
49
|
+
schema_str = json.dumps(json_schema)
|
50
|
+
elif isinstance(json_schema, str):
|
51
|
+
schema_str = json_schema
|
52
|
+
elif issubclass(json_schema, BaseModel):
|
53
|
+
schema_str = json.dumps(json_schema.model_json_schema())
|
54
|
+
else:
|
55
|
+
raise ValueError(
|
56
|
+
f"Cannot parse schema {json_schema}. The schema must be either "
|
57
|
+
+ "a Pydantic class, a dictionary or a string that contains the JSON "
|
58
|
+
+ "schema specification"
|
59
|
+
)
|
60
|
+
return schema_str
|
61
|
+
|
62
|
+
|
32
63
|
def get_exception_traceback():
|
33
64
|
etype, value, tb = sys.exc_info()
|
34
65
|
err_str = "".join(traceback.format_exception(etype, value, tb))
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.4.4.post1"
|
1
|
+
__version__ = "0.4.4.post2"
|
{sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.4.4.post1
|
3
|
+
Version: 0.4.4.post2
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -235,17 +235,17 @@ Requires-Dist: psutil; extra == "runtime-common"
|
|
235
235
|
Requires-Dist: pydantic; extra == "runtime-common"
|
236
236
|
Requires-Dist: python-multipart; extra == "runtime-common"
|
237
237
|
Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
|
238
|
+
Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
|
238
239
|
Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
|
239
|
-
Requires-Dist: transformers==4.
|
240
|
+
Requires-Dist: transformers==4.50.0; extra == "runtime-common"
|
240
241
|
Requires-Dist: uvicorn; extra == "runtime-common"
|
241
242
|
Requires-Dist: uvloop; extra == "runtime-common"
|
242
|
-
Requires-Dist: xgrammar==0.1.
|
243
|
+
Requires-Dist: xgrammar==0.1.16; extra == "runtime-common"
|
243
244
|
Provides-Extra: srt
|
244
245
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
245
|
-
Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
|
246
|
+
Requires-Dist: sgl-kernel==0.0.5.post3; extra == "srt"
|
246
247
|
Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
|
247
248
|
Requires-Dist: torch==2.5.1; extra == "srt"
|
248
|
-
Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
|
249
249
|
Requires-Dist: cuda-python; extra == "srt"
|
250
250
|
Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
|
251
251
|
Provides-Extra: srt-hip
|
@@ -271,7 +271,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
|
|
271
271
|
Provides-Extra: litellm
|
272
272
|
Requires-Dist: litellm>=1.0.0; extra == "litellm"
|
273
273
|
Provides-Extra: torch-memory-saver
|
274
|
-
Requires-Dist: torch_memory_saver; extra == "torch-memory-saver"
|
274
|
+
Requires-Dist: torch_memory_saver>=0.0.3; extra == "torch-memory-saver"
|
275
275
|
Provides-Extra: test
|
276
276
|
Requires-Dist: jsonlines; extra == "test"
|
277
277
|
Requires-Dist: matplotlib; extra == "test"
|
@@ -319,6 +319,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
|
|
319
319
|
Provides-Extra: dev-cpu
|
320
320
|
Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
|
321
321
|
Requires-Dist: sglang[test]; extra == "dev-cpu"
|
322
|
+
Dynamic: license-file
|
322
323
|
|
323
324
|
<div align="center" id="sglangtop">
|
324
325
|
<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
|
@@ -342,6 +343,9 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
|
|
342
343
|
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
|
343
344
|
|
344
345
|
## News
|
346
|
+
- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
|
347
|
+
- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
|
348
|
+
- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
|
345
349
|
- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
|
346
350
|
- [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
|
347
351
|
- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
|
@@ -362,7 +366,7 @@ SGLang is a fast serving framework for large language models and vision language
|
|
362
366
|
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
|
363
367
|
The core features include:
|
364
368
|
|
365
|
-
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching,
|
369
|
+
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
|
366
370
|
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
367
371
|
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
|
368
372
|
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|