sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +26 -4
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +676 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +49 -8
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +42 -8
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +78 -13
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +133 -55
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +434 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +41 -19
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +60 -20
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +80 -53
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +25 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -19
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +78 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +87 -33
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +67 -30
- sglang/srt/lora/mem_pool.py +117 -52
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +18 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +43 -5
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -30
- sglang/srt/managers/scheduler.py +290 -31
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -24
- sglang/srt/managers/tp_worker.py +4 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +36 -21
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +75 -8
- sglang/srt/model_loader/loader.py +171 -3
- sglang/srt/model_loader/weight_utils.py +51 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +31 -88
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +329 -73
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +694 -0
- sglang/srt/models/gemma3_mm.py +468 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +201 -104
- sglang/srt/openai_api/protocol.py +33 -7
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +114 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +140 -54
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +215 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +29 -2
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +56 -5
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/test/runners.py
CHANGED
@@ -19,10 +19,16 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
-from transformers import
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+)
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Engine
+from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
 
 DEFAULT_PROMPTS = [
@@ -140,7 +146,6 @@ class HFRunner:
     def _get_gme_qwen2_vl_embeddings(
         self, prompts, image_data: Optional[List[str]] = None
     ):
-        from sglang.srt.utils import load_image
 
         images = None
         if image_data is not None:
@@ -226,6 +231,9 @@ class HFRunner:
                 low_cpu_mem_usage=True,
             ).cuda()
             self.processor = AutoProcessor.from_pretrained(model_path)
+        elif "clip" in model_path.lower():
+            self.model = AutoModel.from_pretrained(model_path).cuda()
+            self.processor = AutoProcessor.from_pretrained(model_path)
         else:
             self.model = _get_sentence_transformer_embedding_model(
                 model_path, torch_dtype
@@ -272,6 +280,23 @@ class HFRunner:
             assert not self.output_str_only
             if "gme-qwen2-vl" in model_path.lower():
                 logits = self._get_gme_qwen2_vl_embeddings(prompts, image_data)
+            elif "clip" in model_path.lower():
+                if image_data is not None:
+                    image = load_image(image_data)
+                    inputs = self.processor(
+                        images=image[0], return_tensors="pt"
+                    )
+                    logits = self.model.get_image_features(
+                        pixel_values=inputs.data["pixel_values"].cuda(),
+                    ).tolist()
+                else:
+                    inputs = self.tokenizer(
+                        prompts, padding=True, return_tensors="pt"
+                    )
+                    logits = self.model.get_text_features(
+                        input_ids=inputs.data["input_ids"].cuda(),
+                        attention_mask=inputs.data["attention_mask"].cuda(),
+                    ).tolist()
             else:
                 logits = self.model.encode(prompts).tolist()
             out_queue.put(ModelOutput(embed_logits=logits))
@@ -437,6 +462,7 @@ class SRTRunner:
         speculative_eagle_topk: Optional[int] = None,
         speculative_num_draft_tokens: Optional[int] = None,
         disable_overlap_schedule: bool = False,
+        disable_custom_all_reduce: bool = False,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -470,6 +496,7 @@ class SRTRunner:
             enable_ep_moe=enable_ep_moe,
             disable_overlap_schedule=disable_overlap_schedule,
             cuda_graph_max_bs=4,
+            disable_custom_all_reduce=disable_custom_all_reduce,
             **spec_kwargs,
         )
 
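Note: the new CLIP branch goes through the standard transformers API, `AutoModel` plus `get_image_features`/`get_text_features`. A minimal standalone sketch of the text path follows; the checkpoint name is an illustrative assumption, not taken from this diff, and the diff additionally moves tensors to CUDA and converts the result with `.tolist()`.

from transformers import AutoModel, AutoTokenizer

model_path = "openai/clip-vit-base-patch32"  # hypothetical stand-in checkpoint
model = AutoModel.from_pretrained(model_path)  # resolves to a CLIPModel
tokenizer = AutoTokenizer.from_pretrained(model_path)

inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")
# Same projection call used by the new embedding branch above.
text_features = model.get_text_features(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)
print(text_features.shape)  # torch.Size([1, 512]) for this checkpoint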
sglang/test/test_activation.py
CHANGED
@@ -4,9 +4,10 @@ import unittest
 import torch
 
 from sglang.srt.layers.activation import GeluAndMul
+from sglang.test.test_utils import CustomTestCase
 
 
-class TestGeluAndMul(unittest.TestCase):
+class TestGeluAndMul(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     NUM_TOKENS = [7, 83, 2048]
     D = [512, 4096, 5120, 13824]
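For orientation: GeluAndMul is a gated activation that applies GELU to the first half of the last dimension and multiplies by the second half, which is why the test's input width is 2*D. A hedged reference sketch of that math (commonly with the tanh approximation), not the sglang kernel itself:

import torch
import torch.nn.functional as F

def gelu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # Gated GELU: activate the first half of the last dim, gate with the second.
    d = x.shape[-1] // 2
    return F.gelu(x[..., :d], approximate="tanh") * x[..., d:]

out = gelu_and_mul_ref(torch.randn(7, 2 * 512))
print(out.shape)  # torch.Size([7, 512])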
sglang/test/test_block_fp8.py
CHANGED
@@ -11,6 +11,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
+from sglang.test.test_utils import CustomTestCase
 
 _is_cuda = torch.cuda.is_available() and torch.version.cuda
 
@@ -44,7 +45,7 @@ def native_per_token_group_quant_fp8(
     return x_q, x_s
 
 
-class TestPerTokenGroupQuantFP8(unittest.TestCase):
+class TestPerTokenGroupQuantFP8(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16, torch.float32]
     NUM_TOKENS = [7, 83, 2048]
     D = [512, 4096, 5120, 13824]
@@ -111,7 +112,7 @@ def native_static_quant_fp8(x, x_s, dtype=torch.float8_e4m3fn):
     return x_q, x_s
 
 
-class TestStaticQuantFP8(unittest.TestCase):
+class TestStaticQuantFP8(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16, torch.float32]
     NUM_TOKENS = [7, 83, 2048]
     D = [512, 4096, 5120, 13824]
@@ -210,7 +211,7 @@ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.fl
     return C
 
 
-class TestW8A8BlockFP8Matmul(unittest.TestCase):
+class TestW8A8BlockFP8Matmul(CustomTestCase):
 
     if not _is_cuda:
         OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
@@ -331,7 +332,7 @@ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
     ).sum(dim=1)
 
 
-class TestW8A8BlockFP8FusedMoE(unittest.TestCase):
+class TestW8A8BlockFP8FusedMoE(CustomTestCase):
     DTYPES = [torch.float32, torch.half, torch.bfloat16]
     M = [1, 33, 64, 222, 1024 * 128]
    N = [128, 1024, 2048]
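These tests compare fused kernels against eager references such as `native_per_token_group_quant_fp8` and `native_w8a8_block_fp8_matmul`. For orientation, here is a minimal sketch of the per-group symmetric FP8 quantization idea those references exercise, assuming torch >= 2.1 for the `float8_e4m3fn` dtype; it illustrates the technique and is not sglang's reference implementation:

import torch

def per_group_quant_fp8_sketch(x: torch.Tensor, group_size: int, eps: float = 1e-10):
    """Symmetric per-group quantization to float8_e4m3fn (illustrative sketch)."""
    fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
    x_grouped = x.reshape(*x.shape[:-1], x.shape[-1] // group_size, group_size)
    # One scale per group: the group's max |x| maps onto the fp8 max value.
    scale = x_grouped.abs().amax(dim=-1, keepdim=True).clamp(min=eps) / fp8_max
    x_q = (x_grouped / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return x_q.reshape(x.shape), scale.squeeze(-1)

x = torch.randn(7, 512)
x_q, s = per_group_quant_fp8_sketch(x, group_size=128)
print(x_q.dtype, s.shape)  # torch.float8_e4m3fn torch.Size([7, 4])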
sglang/test/test_block_fp8_ep.py
CHANGED
@@ -13,6 +13,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     silu_and_mul_triton_kernel,
 )
 from sglang.srt.layers.moe.topk import select_experts
+from sglang.test.test_utils import CustomTestCase
 
 
 # For test
@@ -232,7 +233,7 @@ def block_dequant(
     return x_dq_block
 
 
-class TestW8A8BlockFP8EPMoE(unittest.TestCase):
+class TestW8A8BlockFP8EPMoE(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     M = [1, 222, 1024, 2048]
     N = [128, 1024, 2048]
|
@@ -0,0 +1,58 @@
|
|
1
|
+
import unittest
|
2
|
+
|
3
|
+
import torch
|
4
|
+
|
5
|
+
from sglang.srt.utils import DynamicGradMode
|
6
|
+
from sglang.test.test_utils import CustomTestCase
|
7
|
+
|
8
|
+
|
9
|
+
class TestDynamicGradMode(CustomTestCase):
|
10
|
+
def test_inference(self):
|
11
|
+
# Test inference_mode
|
12
|
+
DynamicGradMode.set_inference_mode(True)
|
13
|
+
|
14
|
+
@DynamicGradMode()
|
15
|
+
def create_tensor_x():
|
16
|
+
return torch.empty(0)
|
17
|
+
|
18
|
+
X = create_tensor_x()
|
19
|
+
self.assertTrue(not X.requires_grad and X.is_inference())
|
20
|
+
|
21
|
+
def test_no_grad(self):
|
22
|
+
# Test no_grad
|
23
|
+
DynamicGradMode.set_inference_mode(False)
|
24
|
+
|
25
|
+
@DynamicGradMode()
|
26
|
+
def create_tensor_y():
|
27
|
+
return torch.empty(0)
|
28
|
+
|
29
|
+
Y = create_tensor_y()
|
30
|
+
self.assertTrue(not Y.requires_grad and not Y.is_inference())
|
31
|
+
|
32
|
+
def test_nested_inference(self):
|
33
|
+
# Test no_grad nested inference_mode, inference_mode should has higher priority
|
34
|
+
DynamicGradMode.set_inference_mode(False)
|
35
|
+
|
36
|
+
@DynamicGradMode()
|
37
|
+
def create_tensor_z():
|
38
|
+
with torch.inference_mode():
|
39
|
+
return torch.empty(0)
|
40
|
+
|
41
|
+
Z = create_tensor_z()
|
42
|
+
self.assertTrue(not Z.requires_grad and Z.is_inference())
|
43
|
+
|
44
|
+
def test_nested_no_grad(self):
|
45
|
+
# Test inference_mode nested no_grad, inference_mode should has higher priority
|
46
|
+
DynamicGradMode.set_inference_mode(True)
|
47
|
+
|
48
|
+
@DynamicGradMode()
|
49
|
+
def create_tensor_w():
|
50
|
+
with torch.no_grad():
|
51
|
+
return torch.empty(0)
|
52
|
+
|
53
|
+
W = create_tensor_w()
|
54
|
+
self.assertTrue(not W.requires_grad and W.is_inference())
|
55
|
+
|
56
|
+
|
57
|
+
if __name__ == "__main__":
|
58
|
+
unittest.main(verbosity=2)
|
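The `DynamicGradMode` implementation itself lives in sglang/srt/utils.py, whose diff is not expanded on this page, but the tests pin down its contract: a decorator that wraps the function in torch.inference_mode when the class-level flag is set and torch.no_grad otherwise. A minimal sketch consistent with these tests; the real implementation may differ:

import torch


class DynamicGradModeSketch:
    """Illustrative stand-in for sglang.srt.utils.DynamicGradMode (assumed behavior)."""

    _inference_mode = False

    @classmethod
    def set_inference_mode(cls, value: bool):
        cls._inference_mode = value

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            # Pick the context at call time so set_inference_mode() takes
            # effect even after decoration.
            if DynamicGradModeSketch._inference_mode:
                ctx = torch.inference_mode()
            else:
                ctx = torch.no_grad()
            with ctx:
                return fn(*args, **kwargs)

        return wrapper


DynamicGradModeSketch.set_inference_mode(True)


@DynamicGradModeSketch()
def make_tensor():
    return torch.empty(0)


t = make_tensor()
print(t.is_inference(), t.requires_grad)  # True False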
sglang/test/test_layernorm.py
CHANGED
@@ -4,9 +4,10 @@ import unittest
 import torch
 
 from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
+from sglang.test.test_utils import CustomTestCase
 
 
-class TestRMSNorm(unittest.TestCase):
+class TestRMSNorm(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     NUM_TOKENS = [7, 83, 4096]
     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
@@ -56,7 +57,7 @@ class TestRMSNorm(unittest.TestCase):
         self._run_rms_norm_test(*params)
 
 
-class TestGemmaRMSNorm(unittest.TestCase):
+class TestGemmaRMSNorm(CustomTestCase):
     DTYPES = [torch.half, torch.bfloat16]
     NUM_TOKENS = [7, 83, 4096]
     HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
sglang/test/test_utils.py
CHANGED
@@ -1,15 +1,17 @@
 """Common utilities for testing and benchmarking"""
 
 import argparse
-import asyncio
 import copy
+import logging
 import os
 import random
 import subprocess
 import threading
 import time
+import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
 from functools import partial
 from types import SimpleNamespace
 from typing import Callable, List, Optional, Tuple
@@ -27,11 +29,20 @@ from sglang.srt.utils import get_bool_env_var, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
-DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
+DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
 DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
 )
+DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
+    "nvidia/Llama-3.1-8B-Instruct-FP8"
+)
+# TODO(yundai424): right now specifying to an older revision since the latest one
+# carries kv cache quantization which doesn't work yet
+DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION = (
+    "13858565416dbdc0b4e7a4a677fadfbd5b9e5bb9"
+)
+
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
@@ -52,7 +63,6 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
 
-
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
 
@@ -423,6 +433,11 @@ def popen_launch_server(
                 return process
         except requests.RequestException:
             pass
+
+        return_code = process.poll()
+        if return_code is not None:
+            raise Exception(f"Server unexpectedly exits ({return_code=}).")
+
         time.sleep(10)
 
     kill_process_tree(process.pid)
@@ -453,7 +468,13 @@ def run_with_timeout(
     return ret_value[0]
 
 
-def run_unittest_files(files: List[str], timeout_per_file: float):
+@dataclass
+class TestFile:
+    name: str
+    estimated_time: float = 60
+
+
+def run_unittest_files(files: List[TestFile], timeout_per_file: float):
     tic = time.time()
     success = True
 
@@ -870,7 +891,6 @@ def run_mulit_request_test(
     enable_overlap=False,
     chunked_prefill_size=32,
 ):
-
    def workload_func(base_url, model):
        def run_one(_):
            prompt = """
@@ -905,6 +925,10 @@ def run_mulit_request_test(
 
 
 def write_github_step_summary(content):
+    if not os.environ.get("GITHUB_STEP_SUMMARY"):
+        logging.warning("GITHUB_STEP_SUMMARY environment variable not set")
+        return
+
     with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
         f.write(content)
 
@@ -982,3 +1006,30 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
                 rank += 1
             else:
                 raise
+
+
+class CustomTestCase(unittest.TestCase):
+    def _callTestMethod(self, method):
+        _retry_execution(
+            lambda: super(CustomTestCase, self)._callTestMethod(method),
+            max_retry=_get_max_retry(),
+        )
+
+
+def _get_max_retry():
+    return int(os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0"))
+
+
+def _retry_execution(fn, max_retry: int):
+    if max_retry == 0:
+        fn()
+        return
+
+    try:
+        fn()
+    except Exception as e:
+        print(
+            f"retry_execution failed once and will retry. This may be an error or a flaky test. Error: {e}"
+        )
+        traceback.print_exc()
+        _retry_execution(fn, max_retry=max_retry - 1)
sglang/utils.py
CHANGED
@@ -22,6 +22,7 @@ from typing import Any, Callable, List, Optional, Tuple, Type, Union
 import numpy as np
 import requests
 from IPython.display import HTML, display
+from pydantic import BaseModel
 from tqdm import tqdm
 
 from sglang.srt.utils import kill_process_tree
@@ -29,6 +30,36 @@ from sglang.srt.utils import kill_process_tree
 logger = logging.getLogger(__name__)
 
 
+def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str:
+    """Convert a JSON schema to a string.
+    Parameters
+    ----------
+    json_schema
+        The JSON schema.
+    Returns
+    -------
+    str
+        The JSON schema converted to a string.
+    Raises
+    ------
+    ValueError
+        If the schema is not a dictionary, a string or a Pydantic class.
+    """
+    if isinstance(json_schema, dict):
+        schema_str = json.dumps(json_schema)
+    elif isinstance(json_schema, str):
+        schema_str = json_schema
+    elif issubclass(json_schema, BaseModel):
+        schema_str = json.dumps(json_schema.model_json_schema())
+    else:
+        raise ValueError(
+            f"Cannot parse schema {json_schema}. The schema must be either "
+            + "a Pydantic class, a dictionary or a string that contains the JSON "
+            + "schema specification"
+        )
+    return schema_str
+
+
 def get_exception_traceback():
     etype, value, tb = sys.exc_info()
     err_str = "".join(traceback.format_exception(etype, value, tb))
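The new `convert_json_schema_to_str` normalizes the three schema forms used for constrained decoding: dicts are dumped, strings pass through unchanged, and Pydantic model classes are converted via `model_json_schema()`. A small usage sketch; the `Answer` model is a hypothetical example:

import json

from pydantic import BaseModel

from sglang.utils import convert_json_schema_to_str


class Answer(BaseModel):  # hypothetical example schema
    city: str
    population: int


# All three input forms normalize to the same JSON string payload.
s1 = convert_json_schema_to_str(Answer)
s2 = convert_json_schema_to_str(Answer.model_json_schema())
s3 = convert_json_schema_to_str(json.dumps(Answer.model_json_schema()))
print(s1 == s2 == s3)  # True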
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.4.post1"
+__version__ = "0.4.4.post3"
{sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.4.post1
+Version: 0.4.4.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -218,6 +218,7 @@ Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
+Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -235,19 +236,22 @@ Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.50.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
+Requires-Dist: compressed-tensors; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.5.post4; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
-Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
+Requires-Dist: partial_json_parser; extra == "srt"
+Requires-Dist: einops; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -271,7 +275,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver; extra == "torch-memory-saver"
+Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
@@ -319,6 +323,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 Provides-Extra: dev-cpu
 Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
 Requires-Dist: sglang[test]; extra == "dev-cpu"
+Dynamic: license-file
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -342,6 +347,9 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
+- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -362,7 +370,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching,
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.