sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +26 -4
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +676 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +49 -8
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +42 -8
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +78 -13
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +133 -55
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +434 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +41 -19
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +60 -20
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +80 -53
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +25 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -19
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +78 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +87 -33
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +67 -30
- sglang/srt/lora/mem_pool.py +117 -52
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +18 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +43 -5
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -30
- sglang/srt/managers/scheduler.py +290 -31
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -24
- sglang/srt/managers/tp_worker.py +4 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +36 -21
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +75 -8
- sglang/srt/model_loader/loader.py +171 -3
- sglang/srt/model_loader/weight_utils.py +51 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +31 -88
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +329 -73
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +694 -0
- sglang/srt/models/gemma3_mm.py +468 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +201 -104
- sglang/srt/openai_api/protocol.py +33 -7
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +114 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +140 -54
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +215 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +29 -2
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +56 -5
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/test/runners.py
CHANGED
@@ -19,10 +19,16 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
-from transformers import
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+)
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Engine
+from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
 
 DEFAULT_PROMPTS = [
@@ -140,7 +146,6 @@ class HFRunner:
     def _get_gme_qwen2_vl_embeddings(
         self, prompts, image_data: Optional[List[str]] = None
     ):
-        from sglang.srt.utils import load_image
 
         images = None
         if image_data is not None:
@@ -226,6 +231,9 @@ class HFRunner:
                 low_cpu_mem_usage=True,
             ).cuda()
             self.processor = AutoProcessor.from_pretrained(model_path)
+        elif "clip" in model_path.lower():
+            self.model = AutoModel.from_pretrained(model_path).cuda()
+            self.processor = AutoProcessor.from_pretrained(model_path)
         else:
             self.model = _get_sentence_transformer_embedding_model(
                 model_path, torch_dtype
@@ -272,6 +280,23 @@ class HFRunner:
             assert not self.output_str_only
             if "gme-qwen2-vl" in model_path.lower():
                 logits = self._get_gme_qwen2_vl_embeddings(prompts, image_data)
+            elif "clip" in model_path.lower():
+                if image_data is not None:
+                    image = load_image(image_data)
+                    inputs = self.processor(
+                        images=image[0], return_tensors="pt"
+                    )
+                    logits = self.model.get_image_features(
+                        pixel_values=inputs.data["pixel_values"].cuda(),
+                    ).tolist()
+                else:
+                    inputs = self.tokenizer(
+                        prompts, padding=True, return_tensors="pt"
+                    )
+                    logits = self.model.get_text_features(
+                        input_ids=inputs.data["input_ids"].cuda(),
+                        attention_mask=inputs.data["attention_mask"].cuda(),
+                    ).tolist()
             else:
                 logits = self.model.encode(prompts).tolist()
             out_queue.put(ModelOutput(embed_logits=logits))
@@ -437,6 +462,7 @@ class SRTRunner:
         speculative_eagle_topk: Optional[int] = None,
         speculative_num_draft_tokens: Optional[int] = None,
         disable_overlap_schedule: bool = False,
+        disable_custom_all_reduce: bool = False,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -470,6 +496,7 @@ class SRTRunner:
             enable_ep_moe=enable_ep_moe,
             disable_overlap_schedule=disable_overlap_schedule,
             cuda_graph_max_bs=4,
+            disable_custom_all_reduce=disable_custom_all_reduce,
             **spec_kwargs,
         )
 
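Note: the new CLIP branch goes through the standard transformers API, `AutoModel` plus `get_image_features`/`get_text_features`. A minimal standalone sketch of the text path follows; the checkpoint name is an illustrative assumption, not taken from this diff, and the diff additionally moves tensors to CUDA and converts the result with `.tolist()`.

from transformers import AutoModel, AutoTokenizer

model_path = "openai/clip-vit-base-patch32"  # hypothetical stand-in checkpoint
model = AutoModel.from_pretrained(model_path)  # resolves to a CLIPModel
tokenizer = AutoTokenizer.from_pretrained(model_path)

inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
# Same projection call used by the new embedding branch above.
text_features = model.get_text_features(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)
print(text_features.shape)  # torch.Size([1, 512]) for this checkpoint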
sglang/test/test_activation.py
CHANGED
@@ -4,9 +4,10 @@ import unittest
 import torch
 
 from sglang.srt.layers.activation import GeluAndMul
+from sglang.test.test_utils import CustomTestCase
 
 
-class TestGeluAndMul(unittest.TestCase):
+class TestGeluAndMul(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     NUM_TOKENS = [7, 83, 2048]
     D = [512, 4096, 5120, 13824]
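For orientation: GeluAndMul is a gated activation that applies GELU to the first half of the last dimension and multiplies by the second half, which is why the test's input width is 2*D. A hedged reference sketch of that math (commonly with the tanh approximation), not the sglang kernel itself:

import torch
import torch.nn.functional as F

def gelu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # Gated GELU: activate the first half of the last dim, gate with the second.
    d = x.shape[-1] // 2
    return F.gelu(x[..., :d], approximate="tanh") * x[..., d:]

out = gelu_and_mul_ref(torch.randn(7, 2 * 512))
print(out.shape)  # torch.Size([7, 512])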
sglang/test/test_block_fp8.py
CHANGED
@@ -11,6 +11,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
+from sglang.test.test_utils import CustomTestCase
 
 _is_cuda = torch.cuda.is_available() and torch.version.cuda
 
@@ -44,7 +45,7 @@ def native_per_token_group_quant_fp8(
     return x_q, x_s
 
 
-class TestPerTokenGroupQuantFP8(unittest.TestCase):
+class TestPerTokenGroupQuantFP8(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16, torch.float32]
     NUM_TOKENS = [7, 83, 2048]
     D = [512, 4096, 5120, 13824]
@@ -111,7 +112,7 @@ def native_static_quant_fp8(x, x_s, dtype=torch.float8_e4m3fn):
     return x_q, x_s
 
 
-class TestStaticQuantFP8(unittest.TestCase):
+class TestStaticQuantFP8(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16, torch.float32]
     NUM_TOKENS = [7, 83, 2048]
     D = [512, 4096, 5120, 13824]
@@ -210,7 +211,7 @@ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.fl
     return C
 
 
-class TestW8A8BlockFP8Matmul(unittest.TestCase):
+class TestW8A8BlockFP8Matmul(CustomTestCase):
 
     if not _is_cuda:
         OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
@@ -331,7 +332,7 @@ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
     ).sum(dim=1)
 
 
-class TestW8A8BlockFP8FusedMoE(unittest.TestCase):
+class TestW8A8BlockFP8FusedMoE(CustomTestCase):
     DTYPES = [torch.float32, torch.half, torch.bfloat16]
     M = [1, 33, 64, 222, 1024 * 128]
    N = [128, 1024, 2048]
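These tests compare fused kernels against eager references such as `native_per_token_group_quant_fp8` and `native_w8a8_block_fp8_matmul`. For orientation, here is a minimal sketch of the per-group symmetric FP8 quantization idea those references exercise, assuming torch >= 2.1 for the `float8_e4m3fn` dtype; it illustrates the technique and is not sglang's reference implementation:

import torch

def per_group_quant_fp8_sketch(x: torch.Tensor, group_size: int, eps: float = 1e-10):
    """Symmetric per-group quantization to float8_e4m3fn (illustrative sketch)."""
    fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
    x_grouped = x.reshape(*x.shape[:-1], x.shape[-1] // group_size, group_size)
    # One scale per group: the group's max |x| maps onto the fp8 max value.
    scale = x_grouped.abs().amax(dim=-1, keepdim=True).clamp(min=eps) / fp8_max
    x_q = (x_grouped / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return x_q.reshape(x.shape), scale.squeeze(-1)

x = torch.randn(7, 512)
x_q, s = per_group_quant_fp8_sketch(x, group_size=128)
print(x_q.dtype, s.shape)  # torch.float8_e4m3fn torch.Size([7, 4])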
sglang/test/test_block_fp8_ep.py
CHANGED
@@ -13,6 +13,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     silu_and_mul_triton_kernel,
 )
 from sglang.srt.layers.moe.topk import select_experts
+from sglang.test.test_utils import CustomTestCase
 
 
 # For test
@@ -232,7 +233,7 @@ def block_dequant(
     return x_dq_block
 
 
-class TestW8A8BlockFP8EPMoE(unittest.TestCase):
+class TestW8A8BlockFP8EPMoE(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     M = [1, 222, 1024, 2048]
     N = [128, 1024, 2048]
|
@@ -0,0 +1,58 @@
|
|
1
|
+
import unittest
|
2
|
+
|
3
|
+
import torch
|
4
|
+
|
5
|
+
from sglang.srt.utils import DynamicGradMode
|
6
|
+
from sglang.test.test_utils import CustomTestCase
|
7
|
+
|
8
|
+
|
9
|
+
class TestDynamicGradMode(CustomTestCase):
|
10
|
+
def test_inference(self):
|
11
|
+
# Test inference_mode
|
12
|
+
DynamicGradMode.set_inference_mode(True)
|
13
|
+
|
14
|
+
@DynamicGradMode()
|
15
|
+
def create_tensor_x():
|
16
|
+
return torch.empty(0)
|
17
|
+
|
18
|
+
X = create_tensor_x()
|
19
|
+
self.assertTrue(not X.requires_grad and X.is_inference())
|
20
|
+
|
21
|
+
def test_no_grad(self):
|
22
|
+
# Test no_grad
|
23
|
+
DynamicGradMode.set_inference_mode(False)
|
24
|
+
|
25
|
+
@DynamicGradMode()
|
26
|
+
def create_tensor_y():
|
27
|
+
return torch.empty(0)
|
28
|
+
|
29
|
+
Y = create_tensor_y()
|
30
|
+
self.assertTrue(not Y.requires_grad and not Y.is_inference())
|
31
|
+
|
32
|
+
def test_nested_inference(self):
|
33
|
+
# Test no_grad nested inference_mode, inference_mode should has higher priority
|
34
|
+
DynamicGradMode.set_inference_mode(False)
|
35
|
+
|
36
|
+
@DynamicGradMode()
|
37
|
+
def create_tensor_z():
|
38
|
+
with torch.inference_mode():
|
39
|
+
return torch.empty(0)
|
40
|
+
|
41
|
+
Z = create_tensor_z()
|
42
|
+
self.assertTrue(not Z.requires_grad and Z.is_inference())
|
43
|
+
|
44
|
+
def test_nested_no_grad(self):
|
45
|
+
# Test inference_mode nested no_grad, inference_mode should has higher priority
|
46
|
+
DynamicGradMode.set_inference_mode(True)
|
47
|
+
|
48
|
+
@DynamicGradMode()
|
49
|
+
def create_tensor_w():
|
50
|
+
with torch.no_grad():
|
51
|
+
return torch.empty(0)
|
52
|
+
|
53
|
+
W = create_tensor_w()
|
54
|
+
self.assertTrue(not W.requires_grad and W.is_inference())
|
55
|
+
|
56
|
+
|
57
|
+
if __name__ == "__main__":
|
58
|
+
unittest.main(verbosity=2)
|
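The `DynamicGradMode` implementation itself lives in sglang/srt/utils.py, whose diff is not expanded on this page, but the tests pin down its contract: a decorator that wraps the function in torch.inference_mode when the class-level flag is set and torch.no_grad otherwise. A minimal sketch consistent with these tests; the real implementation may differ:

import torch


class DynamicGradModeSketch:
    """Illustrative stand-in for sglang.srt.utils.DynamicGradMode (assumed behavior)."""

    _inference_mode = False

    @classmethod
    def set_inference_mode(cls, value: bool):
        cls._inference_mode = value

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            # Pick the context at call time so set_inference_mode() takes
            # effect even after decoration.
            if DynamicGradModeSketch._inference_mode:
                ctx = torch.inference_mode()
            else:
                ctx = torch.no_grad()
            with ctx:
                return fn(*args, **kwargs)

        return wrapper


DynamicGradModeSketch.set_inference_mode(True)


@DynamicGradModeSketch()
def make_tensor():
    return torch.empty(0)


t = make_tensor()
print(t.is_inference(), t.requires_grad)  # True False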
sglang/test/test_layernorm.py
CHANGED
@@ -4,9 +4,10 @@ import unittest
 import torch
 
 from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
+from sglang.test.test_utils import CustomTestCase
 
 
-class TestRMSNorm(unittest.TestCase):
+class TestRMSNorm(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     NUM_TOKENS = [7, 83, 4096]
     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
@@ -56,7 +57,7 @@ class TestRMSNorm(unittest.TestCase):
         self._run_rms_norm_test(*params)
 
 
-class TestGemmaRMSNorm(unittest.TestCase):
+class TestGemmaRMSNorm(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     NUM_TOKENS = [7, 83, 4096]
     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
sglang/test/test_utils.py
CHANGED
@@ -1,15 +1,17 @@
 """Common utilities for testing and benchmarking"""
 
 import argparse
-import asyncio
 import copy
+import logging
 import os
 import random
 import subprocess
 import threading
 import time
+import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
 from functools import partial
 from types import SimpleNamespace
 from typing import Callable, List, Optional, Tuple
@@ -27,11 +29,20 @@ from sglang.srt.utils import get_bool_env_var, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
-DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
+DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
 DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
 )
+DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
+    "nvidia/Llama-3.1-8B-Instruct-FP8"
+)
+# TODO(yundai424): right now specifying to an older revision since the latest one
+# carries kv cache quantization which doesn't work yet
+DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION = (
+    "13858565416dbdc0b4e7a4a677fadfbd5b9e5bb9"
+)
+
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
@@ -52,7 +63,6 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
 
-
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
 
@@ -423,6 +433,11 @@ def popen_launch_server(
                 return process
         except requests.RequestException:
             pass
+
+        return_code = process.poll()
+        if return_code is not None:
+            raise Exception(f"Server unexpectedly exits ({return_code=}).")
+
         time.sleep(10)
 
     kill_process_tree(process.pid)
@@ -453,7 +468,13 @@ def run_with_timeout(
     return ret_value[0]
 
 
-def run_unittest_files(files: List[str], timeout_per_file: float):
+@dataclass
+class TestFile:
+    name: str
+    estimated_time: float = 60
+
+
+def run_unittest_files(files: List[TestFile], timeout_per_file: float):
     tic = time.time()
     success = True
 
@@ -870,7 +891,6 @@ def run_mulit_request_test(
     enable_overlap=False,
     chunked_prefill_size=32,
 ):
-
    def workload_func(base_url, model):
        def run_one(_):
            prompt = """
@@ -905,6 +925,10 @@ def run_mulit_request_test(
 
 
 def write_github_step_summary(content):
+    if not os.environ.get("GITHUB_STEP_SUMMARY"):
+        logging.warning("GITHUB_STEP_SUMMARY environment variable not set")
+        return
+
     with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
         f.write(content)
 
@@ -982,3 +1006,30 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
                 rank += 1
             else:
                 raise
+
+
+class CustomTestCase(unittest.TestCase):
+    def _callTestMethod(self, method):
+        _retry_execution(
+            lambda: super(CustomTestCase, self)._callTestMethod(method),
+            max_retry=_get_max_retry(),
+        )
+
+
+def _get_max_retry():
+    return int(os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0"))
+
+
+def _retry_execution(fn, max_retry: int):
+    if max_retry == 0:
+        fn()
+        return
+
+    try:
+        fn()
+    except Exception as e:
+        print(
+            f"retry_execution failed once and will retry. This may be an error or a flaky test. Error: {e}"
+        )
+        traceback.print_exc()
+        _retry_execution(fn, max_retry=max_retry - 1)
sglang/utils.py
CHANGED
@@ -22,6 +22,7 @@ from typing import Any, Callable, List, Optional, Tuple, Type, Union
 import numpy as np
 import requests
 from IPython.display import HTML, display
+from pydantic import BaseModel
 from tqdm import tqdm
 
 from sglang.srt.utils import kill_process_tree
@@ -29,6 +30,36 @@ from sglang.srt.utils import kill_process_tree
 logger = logging.getLogger(__name__)
 
 
+def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str:
+    """Convert a JSON schema to a string.
+    Parameters
+    ----------
+    json_schema
+        The JSON schema.
+    Returns
+    -------
+    str
+        The JSON schema converted to a string.
+    Raises
+    ------
+    ValueError
+        If the schema is not a dictionary, a string or a Pydantic class.
+    """
+    if isinstance(json_schema, dict):
+        schema_str = json.dumps(json_schema)
+    elif isinstance(json_schema, str):
+        schema_str = json_schema
+    elif issubclass(json_schema, BaseModel):
+        schema_str = json.dumps(json_schema.model_json_schema())
+    else:
+        raise ValueError(
+            f"Cannot parse schema {json_schema}. The schema must be either "
+            + "a Pydantic class, a dictionary or a string that contains the JSON "
+            + "schema specification"
+        )
+    return schema_str
+
+
 def get_exception_traceback():
     etype, value, tb = sys.exc_info()
     err_str = "".join(traceback.format_exception(etype, value, tb))
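The new `convert_json_schema_to_str` normalizes the three schema forms used for constrained decoding: dicts are dumped, strings pass through unchanged, and Pydantic model classes are converted via `model_json_schema()`. A small usage sketch; the `Answer` model is a hypothetical example:

import json

from pydantic import BaseModel

from sglang.utils import convert_json_schema_to_str


class Answer(BaseModel):  # hypothetical example schema
    city: str
    population: int


# All three input forms normalize to the same JSON string payload.
s1 = convert_json_schema_to_str(Answer)
s2 = convert_json_schema_to_str(Answer.model_json_schema())
s3 = convert_json_schema_to_str(json.dumps(Answer.model_json_schema()))
print(s1 == s2 == s3)  # True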
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.4.post1"
+__version__ = "0.4.4.post3"
{sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.4.post1
+Version: 0.4.4.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -218,6 +218,7 @@ Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
+Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -235,19 +236,22 @@ Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.50.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
+Requires-Dist: compressed-tensors; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.5.post4; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
-Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
+Requires-Dist: partial_json_parser; extra == "srt"
+Requires-Dist: einops; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -271,7 +275,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver; extra == "torch-memory-saver"
+Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
@@ -319,6 +323,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 Provides-Extra: dev-cpu
 Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
 Requires-Dist: sglang[test]; extra == "dev-cpu"
+Dynamic: license-file
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -342,6 +347,9 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
+- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -362,7 +370,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching,
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.