sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +149 -34
 - sglang/bench_serving.py +18 -3
 - sglang/compile_deep_gemm.py +13 -7
 - sglang/srt/batch_invariant_ops/__init__.py +2 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
 - sglang/srt/checkpoint_engine/__init__.py +9 -0
 - sglang/srt/checkpoint_engine/update.py +317 -0
 - sglang/srt/configs/__init__.py +2 -0
 - sglang/srt/configs/deepseek_ocr.py +542 -10
 - sglang/srt/configs/deepseekvl2.py +95 -194
 - sglang/srt/configs/kimi_linear.py +160 -0
 - sglang/srt/configs/mamba_utils.py +66 -0
 - sglang/srt/configs/model_config.py +25 -2
 - sglang/srt/constants.py +7 -0
 - sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
 - sglang/srt/disaggregation/decode.py +34 -6
 - sglang/srt/disaggregation/nixl/conn.py +2 -2
 - sglang/srt/disaggregation/prefill.py +25 -3
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
 - sglang/srt/distributed/parallel_state.py +9 -5
 - sglang/srt/entrypoints/engine.py +13 -5
 - sglang/srt/entrypoints/http_server.py +22 -3
 - sglang/srt/entrypoints/openai/protocol.py +7 -1
 - sglang/srt/entrypoints/openai/serving_chat.py +42 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +10 -0
 - sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
 - sglang/srt/environ.py +7 -0
 - sglang/srt/eplb/expert_distribution.py +34 -1
 - sglang/srt/eplb/expert_location.py +106 -36
 - sglang/srt/grpc/compile_proto.py +3 -0
 - sglang/srt/layers/attention/ascend_backend.py +233 -5
 - sglang/srt/layers/attention/attention_registry.py +3 -0
 - sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
 - sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
 - sglang/srt/layers/attention/fla/kda.py +1359 -0
 - sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
 - sglang/srt/layers/attention/flashattention_backend.py +7 -6
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
 - sglang/srt/layers/attention/flashmla_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
 - sglang/srt/layers/attention/mamba/mamba.py +20 -11
 - sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
 - sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
 - sglang/srt/layers/attention/nsa/transform_index.py +1 -1
 - sglang/srt/layers/attention/nsa_backend.py +157 -23
 - sglang/srt/layers/attention/triton_backend.py +4 -1
 - sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
 - sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
 - sglang/srt/layers/communicator.py +23 -1
 - sglang/srt/layers/layernorm.py +16 -2
 - sglang/srt/layers/logits_processor.py +4 -20
 - sglang/srt/layers/moe/ep_moe/layer.py +0 -18
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
 - sglang/srt/layers/moe/topk.py +31 -6
 - sglang/srt/layers/pooler.py +21 -2
 - sglang/srt/layers/quantization/__init__.py +9 -78
 - sglang/srt/layers/quantization/auto_round.py +394 -0
 - sglang/srt/layers/quantization/fp8_kernel.py +1 -1
 - sglang/srt/layers/quantization/fp8_utils.py +2 -2
 - sglang/srt/layers/quantization/modelopt_quant.py +168 -11
 - sglang/srt/layers/rotary_embedding.py +117 -45
 - sglang/srt/lora/lora_registry.py +9 -0
 - sglang/srt/managers/async_mm_data_processor.py +122 -0
 - sglang/srt/managers/data_parallel_controller.py +30 -3
 - sglang/srt/managers/detokenizer_manager.py +3 -0
 - sglang/srt/managers/io_struct.py +26 -4
 - sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
 - sglang/srt/managers/schedule_batch.py +74 -15
 - sglang/srt/managers/scheduler.py +164 -129
 - sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
 - sglang/srt/managers/scheduler_pp_mixin.py +7 -2
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
 - sglang/srt/managers/session_controller.py +6 -5
 - sglang/srt/managers/tokenizer_manager.py +154 -59
 - sglang/srt/managers/tp_worker.py +24 -1
 - sglang/srt/mem_cache/base_prefix_cache.py +23 -4
 - sglang/srt/mem_cache/common.py +1 -0
 - sglang/srt/mem_cache/memory_pool.py +171 -57
 - sglang/srt/mem_cache/memory_pool_host.py +12 -5
 - sglang/srt/mem_cache/radix_cache.py +4 -0
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
 - sglang/srt/metrics/collector.py +46 -3
 - sglang/srt/model_executor/cuda_graph_runner.py +15 -3
 - sglang/srt/model_executor/forward_batch_info.py +11 -11
 - sglang/srt/model_executor/model_runner.py +76 -21
 - sglang/srt/model_executor/npu_graph_runner.py +7 -3
 - sglang/srt/model_loader/weight_utils.py +1 -1
 - sglang/srt/models/bailing_moe.py +9 -2
 - sglang/srt/models/deepseek_nextn.py +11 -2
 - sglang/srt/models/deepseek_v2.py +149 -34
 - sglang/srt/models/glm4.py +391 -77
 - sglang/srt/models/glm4v.py +196 -55
 - sglang/srt/models/glm4v_moe.py +0 -1
 - sglang/srt/models/gpt_oss.py +1 -10
 - sglang/srt/models/kimi_linear.py +678 -0
 - sglang/srt/models/llama4.py +1 -1
 - sglang/srt/models/llama_eagle3.py +11 -1
 - sglang/srt/models/longcat_flash.py +2 -2
 - sglang/srt/models/minimax_m2.py +1 -1
 - sglang/srt/models/qwen2.py +1 -1
 - sglang/srt/models/qwen2_moe.py +30 -15
 - sglang/srt/models/qwen3.py +1 -1
 - sglang/srt/models/qwen3_moe.py +16 -8
 - sglang/srt/models/qwen3_next.py +7 -0
 - sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
 - sglang/srt/multiplex/multiplexing_mixin.py +209 -0
 - sglang/srt/multiplex/pdmux_context.py +164 -0
 - sglang/srt/parser/conversation.py +7 -1
 - sglang/srt/sampling/custom_logit_processor.py +67 -1
 - sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
 - sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
 - sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
 - sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
 - sglang/srt/server_args.py +103 -22
 - sglang/srt/single_batch_overlap.py +4 -1
 - sglang/srt/speculative/draft_utils.py +16 -0
 - sglang/srt/speculative/eagle_info.py +42 -36
 - sglang/srt/speculative/eagle_info_v2.py +68 -25
 - sglang/srt/speculative/eagle_utils.py +261 -16
 - sglang/srt/speculative/eagle_worker.py +11 -3
 - sglang/srt/speculative/eagle_worker_v2.py +15 -9
 - sglang/srt/speculative/spec_info.py +305 -31
 - sglang/srt/speculative/spec_utils.py +44 -8
 - sglang/srt/tracing/trace.py +121 -12
 - sglang/srt/utils/common.py +55 -32
 - sglang/srt/utils/hf_transformers_utils.py +38 -16
 - sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
 - sglang/test/kits/radix_cache_server_kit.py +50 -0
 - sglang/test/runners.py +31 -7
 - sglang/test/simple_eval_common.py +5 -3
 - sglang/test/simple_eval_humaneval.py +1 -0
 - sglang/test/simple_eval_math.py +1 -0
 - sglang/test/simple_eval_mmlu.py +1 -0
 - sglang/test/simple_eval_mmmu_vlm.py +1 -0
 - sglang/test/test_utils.py +7 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
 - {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
 - /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
 - {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
 - {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
 
    
        sglang/test/runners.py
    CHANGED
    
    | 
         @@ -12,10 +12,11 @@ 
     | 
|
| 
       12 
12 
     | 
    
         
             
            # limitations under the License.
         
     | 
| 
       13 
13 
     | 
    
         
             
            # ==============================================================================
         
     | 
| 
       14 
14 
     | 
    
         | 
| 
      
 15 
     | 
    
         
            +
            import json
         
     | 
| 
       15 
16 
     | 
    
         
             
            import multiprocessing as mp
         
     | 
| 
       16 
17 
     | 
    
         
             
            import os
         
     | 
| 
       17 
18 
     | 
    
         
             
            from dataclasses import dataclass
         
     | 
| 
       18 
     | 
    
         
            -
            from typing import List, Optional, Tuple, Union
         
     | 
| 
      
 19 
     | 
    
         
            +
            from typing import Any, List, Optional, Tuple, Union
         
     | 
| 
       19 
20 
     | 
    
         | 
| 
       20 
21 
     | 
    
         
             
            import torch
         
     | 
| 
       21 
22 
     | 
    
         
             
            import torch.nn.functional as F
         
     | 
| 
         @@ -89,7 +90,9 @@ def get_token_ids_logprobs(logits, token_ids): 
     | 
|
| 
       89 
90 
     | 
    
         
             
                return logprobs
         
     | 
| 
       90 
91 
     | 
    
         | 
| 
       91 
92 
     | 
    
         | 
| 
       92 
     | 
    
         
            -
            def _get_sentence_transformer_embedding_model(model_path, torch_dtype):
     | 
| 
      
 93 
     | 
    
         
            +
            def _get_sentence_transformer_embedding_model(
         
     | 
| 
      
 94 
     | 
    
         
            +
                model_path, torch_dtype, matryoshka_dim: Optional[int] = None
         
     | 
| 
      
 95 
     | 
    
         
            +
            ):
         
     | 
| 
       93 
96 
     | 
    
         
             
                from sentence_transformers import SentenceTransformer
         
     | 
| 
       94 
97 
     | 
    
         
             
                from sentence_transformers.util import is_sentence_transformer_model
         
     | 
| 
       95 
98 
     | 
    
         | 
| 
         @@ -97,6 +100,7 @@ def _get_sentence_transformer_embedding_model(model_path, torch_dtype): 
     | 
|
| 
       97 
100 
     | 
    
         
             
                    model = SentenceTransformer(
         
     | 
| 
       98 
101 
     | 
    
         
             
                        model_path,
         
     | 
| 
       99 
102 
     | 
    
         
             
                        model_kwargs={"torch_dtype": torch_dtype},
         
     | 
| 
      
 103 
     | 
    
         
            +
                        truncate_dim=matryoshka_dim,
         
     | 
| 
       100 
104 
     | 
    
         
             
                    )
         
     | 
| 
       101 
105 
     | 
    
         
             
                else:  # if no pre-trained sentence-transformers model
         
     | 
| 
       102 
106 
     | 
    
         
             
                    from sentence_transformers import models
         
     | 
| 
         @@ -106,7 +110,9 @@ def _get_sentence_transformer_embedding_model(model_path, torch_dtype): 
     | 
|
| 
       106 
110 
     | 
    
         
             
                        word_embedding_model.get_word_embedding_dimension(),
         
     | 
| 
       107 
111 
     | 
    
         
             
                        pooling_mode="lasttoken",
         
     | 
| 
       108 
112 
     | 
    
         
             
                    )
         
     | 
| 
       109 
     | 
    
         
            -
                    model = SentenceTransformer( 
     | 
| 
      
 113 
     | 
    
         
            +
                    model = SentenceTransformer(
         
     | 
| 
      
 114 
     | 
    
         
            +
                        modules=[word_embedding_model, pooling_model], truncate_dim=matryoshka_dim
         
     | 
| 
      
 115 
     | 
    
         
            +
                    )
         
     | 
| 
       110 
116 
     | 
    
         | 
| 
       111 
117 
     | 
    
         
             
                return model.cuda()
         
     | 
| 
       112 
118 
     | 
    
         | 
| 
         @@ -135,6 +141,7 @@ class HFRunner: 
     | 
|
| 
       135 
141 
     | 
    
         
             
                    output_str_only: bool = False,
         
     | 
| 
       136 
142 
     | 
    
         
             
                    trust_remote_code: bool = False,
         
     | 
| 
       137 
143 
     | 
    
         
             
                    patch_model_do_sample_false: bool = False,
         
     | 
| 
      
 144 
     | 
    
         
            +
                    matryoshka_dim: Optional[int] = None,
         
     | 
| 
       138 
145 
     | 
    
         
             
                ):
         
     | 
| 
       139 
146 
     | 
    
         
             
                    self.model_type = model_type
         
     | 
| 
       140 
147 
     | 
    
         
             
                    self.output_str_only = output_str_only
         
     | 
| 
         @@ -151,6 +158,7 @@ class HFRunner: 
     | 
|
| 
       151 
158 
     | 
    
         
             
                            self.out_queue,
         
     | 
| 
       152 
159 
     | 
    
         
             
                            model_path,
         
     | 
| 
       153 
160 
     | 
    
         
             
                            torch_dtype,
         
     | 
| 
      
 161 
     | 
    
         
            +
                            matryoshka_dim,
         
     | 
| 
       154 
162 
     | 
    
         
             
                        ),
         
     | 
| 
       155 
163 
     | 
    
         
             
                    )
         
     | 
| 
       156 
164 
     | 
    
         
             
                    self.model_proc.start()
         
     | 
| 
         @@ -225,7 +233,14 @@ class HFRunner: 
     | 
|
| 
       225 
233 
     | 
    
         
             
                    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
         
     | 
| 
       226 
234 
     | 
    
         
             
                    return embeddings.contiguous()
         
     | 
| 
       227 
235 
     | 
    
         | 
| 
       228 
     | 
    
         
            -
                def start_model_process( 
     | 
| 
      
 236 
     | 
    
         
            +
                def start_model_process(
         
     | 
| 
      
 237 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 238 
     | 
    
         
            +
                    in_queue,
         
     | 
| 
      
 239 
     | 
    
         
            +
                    out_queue,
         
     | 
| 
      
 240 
     | 
    
         
            +
                    model_path,
         
     | 
| 
      
 241 
     | 
    
         
            +
                    torch_dtype,
         
     | 
| 
      
 242 
     | 
    
         
            +
                    matryoshka_dim: Optional[int] = None,
         
     | 
| 
      
 243 
     | 
    
         
            +
                ):
         
     | 
| 
       229 
244 
     | 
    
         
             
                    # Apply model-specific patches
         
     | 
| 
       230 
245 
     | 
    
         
             
                    monkey_patch_gemma2_sdpa()
         
     | 
| 
       231 
246 
     | 
    
         | 
| 
         @@ -259,7 +274,7 @@ class HFRunner: 
     | 
|
| 
       259 
274 
     | 
    
         
             
                            self.processor = AutoProcessor.from_pretrained(model_path)
         
     | 
| 
       260 
275 
     | 
    
         
             
                        else:
         
     | 
| 
       261 
276 
     | 
    
         
             
                            self.model = _get_sentence_transformer_embedding_model(
         
     | 
| 
       262 
     | 
    
         
            -
                                model_path, torch_dtype
         
     | 
| 
      
 277 
     | 
    
         
            +
                                model_path, torch_dtype, matryoshka_dim=matryoshka_dim
         
     | 
| 
       263 
278 
     | 
    
         
             
                            )
         
     | 
| 
       264 
279 
     | 
    
         
             
                    elif self.model_type == "reward" or self.model_type == "cross_encoder":
         
     | 
| 
       265 
280 
     | 
    
         
             
                        from transformers import AutoModelForSequenceClassification
         
     | 
| 
         @@ -496,7 +511,7 @@ class SRTRunner: 
     | 
|
| 
       496 
511 
     | 
    
         
             
                    attention_backend: Optional[str] = None,
         
     | 
| 
       497 
512 
     | 
    
         
             
                    prefill_attention_backend: Optional[str] = None,
         
     | 
| 
       498 
513 
     | 
    
         
             
                    decode_attention_backend: Optional[str] = None,
         
     | 
| 
       499 
     | 
    
         
            -
                    lora_backend: str = " 
     | 
| 
      
 514 
     | 
    
         
            +
                    lora_backend: str = "csgmv",
         
     | 
| 
       500 
515 
     | 
    
         
             
                    disable_cuda_graph: bool = False,
         
     | 
| 
       501 
516 
     | 
    
         
             
                    disable_radix_cache: bool = False,
         
     | 
| 
       502 
517 
     | 
    
         
             
                    chunked_prefill_size: Optional[int] = None,
         
     | 
| 
         @@ -519,6 +534,7 @@ class SRTRunner: 
     | 
|
| 
       519 
534 
     | 
    
         
             
                    lora_target_modules: Optional[List[str]] = None,
         
     | 
| 
       520 
535 
     | 
    
         
             
                    enable_lora: Optional[bool] = None,
         
     | 
| 
       521 
536 
     | 
    
         
             
                    max_loaded_loras: Optional[int] = None,
         
     | 
| 
      
 537 
     | 
    
         
            +
                    json_model_override_args: Optional[dict[str, Any]] = None,
         
     | 
| 
       522 
538 
     | 
    
         
             
                    lora_eviction_policy: str = "lru",
         
     | 
| 
       523 
539 
     | 
    
         
             
                ):
         
     | 
| 
       524 
540 
     | 
    
         
             
                    self.model_type = model_type
         
     | 
| 
         @@ -566,6 +582,11 @@ class SRTRunner: 
     | 
|
| 
       566 
582 
     | 
    
         
             
                        lora_target_modules=lora_target_modules,
         
     | 
| 
       567 
583 
     | 
    
         
             
                        enable_lora=enable_lora,
         
     | 
| 
       568 
584 
     | 
    
         
             
                        max_loaded_loras=max_loaded_loras,
         
     | 
| 
      
 585 
     | 
    
         
            +
                        json_model_override_args=(
         
     | 
| 
      
 586 
     | 
    
         
            +
                            json.dumps(json_model_override_args)
         
     | 
| 
      
 587 
     | 
    
         
            +
                            if json_model_override_args
         
     | 
| 
      
 588 
     | 
    
         
            +
                            else "{}"
         
     | 
| 
      
 589 
     | 
    
         
            +
                        ),
         
     | 
| 
       569 
590 
     | 
    
         
             
                        lora_eviction_policy=lora_eviction_policy,
         
     | 
| 
       570 
591 
     | 
    
         
             
                        **spec_kwargs,
         
     | 
| 
       571 
592 
     | 
    
         
             
                    )
         
     | 
| 
         @@ -594,6 +615,7 @@ class SRTRunner: 
     | 
|
| 
       594 
615 
     | 
    
         
             
                    logprob_start_len: int = 0,
         
     | 
| 
       595 
616 
     | 
    
         
             
                    top_k: Optional[int] = None,
         
     | 
| 
       596 
617 
     | 
    
         
             
                    token_ids_logprob: Optional[List[int]] = None,
         
     | 
| 
      
 618 
     | 
    
         
            +
                    dimensions: Optional[int] = None,
         
     | 
| 
       597 
619 
     | 
    
         
             
                ):
         
     | 
| 
       598 
620 
     | 
    
         
             
                    if self.is_generation:
         
     | 
| 
       599 
621 
     | 
    
         
             
                        return self.forward_generation_raw(
         
     | 
| 
         @@ -607,7 +629,9 @@ class SRTRunner: 
     | 
|
| 
       607 
629 
     | 
    
         
             
                        )
         
     | 
| 
       608 
630 
     | 
    
         
             
                    else:
         
     | 
| 
       609 
631 
     | 
    
         
             
                        if self.model_type == "embedding":
         
     | 
| 
       610 
     | 
    
         
            -
                            response = self.engine.encode( 
     | 
| 
      
 632 
     | 
    
         
            +
                            response = self.engine.encode(
         
     | 
| 
      
 633 
     | 
    
         
            +
                                prompt=prompts, image_data=image_data, dimensions=dimensions
         
     | 
| 
      
 634 
     | 
    
         
            +
                            )
         
     | 
| 
       611 
635 
     | 
    
         
             
                            if isinstance(response, list):
         
     | 
| 
       612 
636 
     | 
    
         
             
                                logits = [x["embedding"] for x in response]
         
     | 
| 
       613 
637 
     | 
    
         
             
                            else:
         
     | 
| 
         @@ -148,7 +148,7 @@ class ChatCompletionSampler(SamplerBase): 
     | 
|
| 
       148 
148 
     | 
    
         
             
                                reasoning_effort=self.reasoning_effort,
         
     | 
| 
       149 
149 
     | 
    
         
             
                                extra_body=self.extra_body,
         
     | 
| 
       150 
150 
     | 
    
         
             
                            )
         
     | 
| 
       151 
     | 
    
         
            -
                            return response.choices[0].message.content
         
     | 
| 
      
 151 
     | 
    
         
            +
                            return response.choices[0].message.content or ""
         
     | 
| 
       152 
152 
     | 
    
         
             
                        # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
         
     | 
| 
       153 
153 
     | 
    
         
             
                        except openai.BadRequestError as e:
         
     | 
| 
       154 
154 
     | 
    
         
             
                            print("Bad Request Error", e)
         
     | 
| 
         @@ -161,7 +161,9 @@ class ChatCompletionSampler(SamplerBase): 
     | 
|
| 
       161 
161 
     | 
    
         
             
                            )
         
     | 
| 
       162 
162 
     | 
    
         
             
                            time.sleep(exception_backoff)
         
     | 
| 
       163 
163 
     | 
    
         
             
                            trial += 1
         
     | 
| 
       164 
     | 
    
         
            -
             
     | 
| 
      
 164 
     | 
    
         
            +
                    # If all retries are exhausted, return empty string instead of None
         
     | 
| 
      
 165 
     | 
    
         
            +
                    print(f"All retry attempts exhausted for request. Returning empty response.")
         
     | 
| 
      
 166 
     | 
    
         
            +
                    return ""
         
     | 
| 
       165 
167 
     | 
    
         | 
| 
       166 
168 
     | 
    
         | 
| 
       167 
169 
     | 
    
         
             
            QUERY_TEMPLATE_MULTICHOICE = """
         
     | 
| 
         @@ -261,7 +263,7 @@ def format_multichoice_question(row): 
     | 
|
| 
       261 
263 
     | 
    
         
             
            def check_equality(sampler: SamplerBase, expr1: str, expr2: str):
         
     | 
| 
       262 
264 
     | 
    
         
             
                prompt = EQUALITY_TEMPLATE % {"expression1": expr1, "expression2": expr2}
         
     | 
| 
       263 
265 
     | 
    
         
             
                response = sampler([dict(content=prompt, role="user")])
         
     | 
| 
       264 
     | 
    
         
            -
                return response.lower().strip() == "yes"
         
     | 
| 
      
 266 
     | 
    
         
            +
                return (response or "").lower().strip() == "yes"
         
     | 
| 
       265 
267 
     | 
    
         | 
| 
       266 
268 
     | 
    
         | 
| 
       267 
269 
     | 
    
         
             
            def _compute_stat(values: list, stat: str):
         
     | 
| 
         @@ -80,6 +80,7 @@ class HumanEval(Eval): 
     | 
|
| 
       80 
80 
     | 
    
         
             
                    instruction = "Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n"
         
     | 
| 
       81 
81 
     | 
    
         | 
| 
       82 
82 
     | 
    
         
             
                    def find_code(completion):
         
     | 
| 
      
 83 
     | 
    
         
            +
                        completion = completion or ""
         
     | 
| 
       83 
84 
     | 
    
         
             
                        pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
         
     | 
| 
       84 
85 
     | 
    
         
             
                        matches = pattern.findall(completion)
         
     | 
| 
       85 
86 
     | 
    
         
             
                        extracted_answer = matches[0] if len(matches) >= 1 else completion
         
     | 
    
        sglang/test/simple_eval_math.py
    CHANGED
    
    | 
         @@ -54,6 +54,7 @@ class MathEval(Eval): 
     | 
|
| 
       54 
54 
     | 
    
         
             
                            sampler._pack_message(content=QUERY_TEMPLATE.format(**row), role="user")
         
     | 
| 
       55 
55 
     | 
    
         
             
                        ]
         
     | 
| 
       56 
56 
     | 
    
         
             
                        response_text = sampler(prompt_messages)
         
     | 
| 
      
 57 
     | 
    
         
            +
                        response_text = response_text or ""
         
     | 
| 
       57 
58 
     | 
    
         
             
                        match = re.search(ANSWER_PATTERN, response_text)
         
     | 
| 
       58 
59 
     | 
    
         
             
                        extracted_answer = match.group(1) if match else None
         
     | 
| 
       59 
60 
     | 
    
         
             
                        score = float(
         
     | 
    
        sglang/test/simple_eval_mmlu.py
    CHANGED
    
    | 
         @@ -101,6 +101,7 @@ class MMLUEval(Eval): 
     | 
|
| 
       101 
101 
     | 
    
         
             
                            )
         
     | 
| 
       102 
102 
     | 
    
         
             
                        ]
         
     | 
| 
       103 
103 
     | 
    
         
             
                        response_text = sampler(prompt_messages)
         
     | 
| 
      
 104 
     | 
    
         
            +
                        response_text = response_text or ""
         
     | 
| 
       104 
105 
     | 
    
         
             
                        match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
         
     | 
| 
       105 
106 
     | 
    
         
             
                        extracted_answer = match.group(1) if match else None
         
     | 
| 
       106 
107 
     | 
    
         
             
                        score = 1.0 if extracted_answer == row["Answer"] else 0.0
         
     | 
    
        sglang/test/test_utils.py
    CHANGED
    
    | 
         @@ -84,6 +84,8 @@ DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = ( 
     | 
|
| 
       84 
84 
     | 
    
         
             
            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
         
     | 
| 
       85 
85 
     | 
    
         
             
            DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
         
     | 
| 
       86 
86 
     | 
    
         
             
            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
         
     | 
| 
      
 87 
     | 
    
         
            +
            DEFAULT_EAGLE_DP_ATTENTION_TARGET_MODEL_FOR_TEST = "Qwen/Qwen3-30B-A3B"
         
     | 
| 
      
 88 
     | 
    
         
            +
            DEFAULT_EAGLE_DP_ATTENTION_DRAFT_MODEL_FOR_TEST = "Tengyunw/qwen3_30b_moe_eagle3"
         
     | 
| 
       87 
89 
     | 
    
         
             
            DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
         
     | 
| 
       88 
90 
     | 
    
         
             
            DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
         
     | 
| 
       89 
91 
     | 
    
         
             
                "meta-llama/Llama-3.1-8B-Instruct"
         
     | 
| 
         @@ -92,6 +94,10 @@ DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-I 
     | 
|
| 
       92 
94 
     | 
    
         
             
            DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"
         
     | 
| 
       93 
95 
     | 
    
         | 
| 
       94 
96 
     | 
    
         
             
            # Other use cases
         
     | 
| 
      
 97 
     | 
    
         
            +
            DEFAULT_AUTOROUND_MODEL_NAME_FOR_TEST = (
         
     | 
| 
      
 98 
     | 
    
         
            +
                "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",  # auto_round:auto_gptq
         
     | 
| 
      
 99 
     | 
    
         
            +
                "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound",  # auto_round:auto_awq
         
     | 
| 
      
 100 
     | 
    
         
            +
            )
         
     | 
| 
       95 
101 
     | 
    
         
             
            DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
         
     | 
| 
       96 
102 
     | 
    
         
             
                "meta-llama/Llama-4-Scout-17B-16E-Instruct"
         
     | 
| 
       97 
103 
     | 
    
         
             
            )
         
     | 
| 
         @@ -145,7 +151,7 @@ def _use_cached_default_models(model_repo: str): 
     | 
|
| 
       145 
151 
     | 
    
         | 
| 
       146 
152 
     | 
    
         
             
            if is_in_ci():
         
     | 
| 
       147 
153 
     | 
    
         
             
                DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
         
     | 
| 
       148 
     | 
    
         
            -
                    10000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) *  
     | 
| 
      
 154 
     | 
    
         
            +
                    10000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 2000
         
     | 
| 
       149 
155 
     | 
    
         
             
                )
         
     | 
| 
       150 
156 
     | 
    
         
             
            else:
         
     | 
| 
       151 
157 
     | 
    
         
             
                DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
         
     | 
    
        sglang/version.py
    CHANGED
    
    | 
         @@ -1 +1 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            __version__ = "0.5.4.post1"
     | 
| 
      
 1 
     | 
    
         
            +
            __version__ = "0.5.4.post2"
         
     | 
| 
         @@ -1,6 +1,6 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            Metadata-Version: 2.4
         
     | 
| 
       2 
2 
     | 
    
         
             
            Name: sglang
         
     | 
| 
       3 
     | 
    
         
            -
            Version: 0.5.4.post1
     | 
| 
      
 3 
     | 
    
         
            +
            Version: 0.5.4.post2
         
     | 
| 
       4 
4 
     | 
    
         
             
            Summary: SGLang is a fast serving framework for large language models and vision language models.
         
     | 
| 
       5 
5 
     | 
    
         
             
            License:                                  Apache License
         
     | 
| 
       6 
6 
     | 
    
         
             
                                               Version 2.0, January 2004
         
     | 
| 
         @@ -234,7 +234,7 @@ Requires-Dist: ninja 
     | 
|
| 
       234 
234 
     | 
    
         
             
            Requires-Dist: numpy
         
     | 
| 
       235 
235 
     | 
    
         
             
            Requires-Dist: nvidia-cutlass-dsl==4.2.1
         
     | 
| 
       236 
236 
     | 
    
         
             
            Requires-Dist: openai-harmony==0.0.4
         
     | 
| 
       237 
     | 
    
         
            -
            Requires-Dist: openai== 
     | 
| 
      
 237 
     | 
    
         
            +
            Requires-Dist: openai==2.6.1
         
     | 
| 
       238 
238 
     | 
    
         
             
            Requires-Dist: orjson
         
     | 
| 
       239 
239 
     | 
    
         
             
            Requires-Dist: outlines==0.1.11
         
     | 
| 
       240 
240 
     | 
    
         
             
            Requires-Dist: packaging
         
     | 
| 
         @@ -256,11 +256,11 @@ Requires-Dist: sgl-kernel==0.3.16.post4 
     | 
|
| 
       256 
256 
     | 
    
         
             
            Requires-Dist: soundfile==0.13.1
         
     | 
| 
       257 
257 
     | 
    
         
             
            Requires-Dist: tiktoken
         
     | 
| 
       258 
258 
     | 
    
         
             
            Requires-Dist: timm==1.0.16
         
     | 
| 
       259 
     | 
    
         
            -
            Requires-Dist: torch==2.8.0
         
     | 
| 
       260 
259 
     | 
    
         
             
            Requires-Dist: torch_memory_saver==0.0.9
         
     | 
| 
       261 
     | 
    
         
            -
            Requires-Dist:  
     | 
| 
      
 260 
     | 
    
         
            +
            Requires-Dist: torch==2.8.0
         
     | 
| 
       262 
261 
     | 
    
         
             
            Requires-Dist: torchaudio==2.8.0
         
     | 
| 
       263 
262 
     | 
    
         
             
            Requires-Dist: torchvision
         
     | 
| 
      
 263 
     | 
    
         
            +
            Requires-Dist: torchao==0.9.0
         
     | 
| 
       264 
264 
     | 
    
         
             
            Requires-Dist: tqdm
         
     | 
| 
       265 
265 
     | 
    
         
             
            Requires-Dist: transformers==4.57.1
         
     | 
| 
       266 
266 
     | 
    
         
             
            Requires-Dist: uvicorn
         
     | 
| 
         @@ -270,8 +270,8 @@ Requires-Dist: grpcio==1.75.1 
     | 
|
| 
       270 
270 
     | 
    
         
             
            Requires-Dist: grpcio-tools==1.75.1
         
     | 
| 
       271 
271 
     | 
    
         
             
            Requires-Dist: grpcio-reflection==1.75.1
         
     | 
| 
       272 
272 
     | 
    
         
             
            Requires-Dist: grpcio-health-checking==1.75.1
         
     | 
| 
       273 
     | 
    
         
            -
            Provides-Extra:  
     | 
| 
       274 
     | 
    
         
            -
            Requires-Dist:  
     | 
| 
      
 273 
     | 
    
         
            +
            Provides-Extra: checkpoint-engine
         
     | 
| 
      
 274 
     | 
    
         
            +
            Requires-Dist: checkpoint-engine==0.1.2; extra == "checkpoint-engine"
         
     | 
| 
       275 
275 
     | 
    
         
             
            Provides-Extra: test
         
     | 
| 
       276 
276 
     | 
    
         
             
            Requires-Dist: accelerate; extra == "test"
         
     | 
| 
       277 
277 
     | 
    
         
             
            Requires-Dist: expecttest; extra == "test"
         
     | 
| 
         @@ -282,28 +282,13 @@ Requires-Dist: peft; extra == "test" 
     | 
|
| 
       282 
282 
     | 
    
         
             
            Requires-Dist: pytest; extra == "test"
         
     | 
| 
       283 
283 
     | 
    
         
             
            Requires-Dist: sentence_transformers; extra == "test"
         
     | 
| 
       284 
284 
     | 
    
         
             
            Requires-Dist: tabulate; extra == "test"
         
     | 
| 
       285 
     | 
    
         
            -
            Provides-Extra: checkpoint-engine
         
     | 
| 
       286 
     | 
    
         
            -
            Requires-Dist: checkpoint-engine==0.1.2; extra == "checkpoint-engine"
         
     | 
| 
       287 
     | 
    
         
            -
            Provides-Extra: all
         
     | 
| 
       288 
285 
     | 
    
         
             
            Provides-Extra: dev
         
     | 
| 
       289 
286 
     | 
    
         
             
            Requires-Dist: sglang[test]; extra == "dev"
         
     | 
| 
       290 
     | 
    
         
            -
            Provides-Extra: cu130
         
     | 
| 
       291 
     | 
    
         
            -
            Requires-Dist: torch==2.9.0; extra == "cu130"
         
     | 
| 
       292 
     | 
    
         
            -
            Requires-Dist: torchaudio==2.9.0; extra == "cu130"
         
     | 
| 
       293 
     | 
    
         
            -
            Requires-Dist: torchvision==0.24.0; extra == "cu130"
         
     | 
| 
       294 
     | 
    
         
            -
            Provides-Extra: cu130-all
         
     | 
| 
       295 
     | 
    
         
            -
            Requires-Dist: sglang[test]; extra == "cu130-all"
         
     | 
| 
       296 
     | 
    
         
            -
            Requires-Dist: sglang[decord]; extra == "cu130-all"
         
     | 
| 
       297 
     | 
    
         
            -
            Requires-Dist: sglang[cu130]; extra == "cu130-all"
         
     | 
| 
       298 
287 
     | 
    
         
             
            Provides-Extra: tracing
         
     | 
| 
       299 
288 
     | 
    
         
             
            Requires-Dist: opentelemetry-api; extra == "tracing"
         
     | 
| 
       300 
289 
     | 
    
         
             
            Requires-Dist: opentelemetry-exporter-otlp; extra == "tracing"
         
     | 
| 
       301 
290 
     | 
    
         
             
            Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
         
     | 
| 
       302 
291 
     | 
    
         
             
            Requires-Dist: opentelemetry-sdk; extra == "tracing"
         
     | 
| 
       303 
     | 
    
         
            -
            Provides-Extra: blackwell
         
     | 
| 
       304 
     | 
    
         
            -
            Requires-Dist: sglang[dev]; extra == "blackwell"
         
     | 
| 
       305 
     | 
    
         
            -
            Provides-Extra: blackwell-aarch64
         
     | 
| 
       306 
     | 
    
         
            -
            Requires-Dist: sglang[dev]; extra == "blackwell-aarch64"
         
     | 
| 
       307 
292 
     | 
    
         
             
            Dynamic: license-file
         
     | 
| 
       308 
293 
     | 
    
         | 
| 
       309 
294 
     | 
    
         
             
            <div align="center" id="sglangtop">
         
     | 
| 
         @@ -328,14 +313,14 @@ Dynamic: license-file 
     | 
|
| 
       328 
313 
     | 
    
         
             
            | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
         
     | 
| 
       329 
314 
     | 
    
         | 
| 
       330 
315 
     | 
    
         
             
            ## News
         
     | 
| 
       331 
     | 
    
         
            -
            - [2025/10] 🔥  
     | 
| 
      
 316 
     | 
    
         
            +
            - [2025/10] 🔥 SGLang now runs natively on TPU with the SGLang-Jax backend ([blog](https://lmsys.org/blog/2025-10-29-sglang-jax/)).
         
     | 
| 
      
 317 
     | 
    
         
            +
            - [2025/10] AMD AI Dev Day 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_amd_ai_devday_2025.pdf)), PyTorch Conference 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_pytorch_2025.pdf)).
         
     | 
| 
       332 
318 
     | 
    
         
             
            - [2025/09] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part II): 3.8x Prefill, 4.8x Decode Throughput ([blog](https://lmsys.org/blog/2025-09-25-gb200-part-2/)).
         
     | 
| 
       333 
319 
     | 
    
         
             
            - [2025/09] SGLang Day 0 Support for DeepSeek-V3.2 with Sparse Attention ([blog](https://lmsys.org/blog/2025-09-29-deepseek-V32/)).
         
     | 
| 
       334 
320 
     | 
    
         
             
            - [2025/08] SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
         
     | 
| 
       335 
321 
     | 
    
         
             
            - [2025/08] SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
         
     | 
| 
       336 
322 
     | 
    
         
             
            - [2025/05] Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
         
     | 
| 
       337 
323 
     | 
    
         
             
            - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
         
     | 
| 
       338 
     | 
    
         
            -
            - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
         
     | 
| 
       339 
324 
     | 
    
         | 
| 
       340 
325 
     | 
    
         
             
            <details>
         
     | 
| 
       341 
326 
     | 
    
         
             
            <summary>More</summary>
         
     | 
| 
         @@ -345,6 +330,7 @@ Dynamic: license-file 
     | 
|
| 
       345 
330 
     | 
    
         
             
            - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
         
     | 
| 
       346 
331 
     | 
    
         
             
            - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
         
     | 
| 
       347 
332 
     | 
    
         
             
            - [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
         
     | 
| 
      
 333 
     | 
    
         
            +
            - [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
         
     | 
| 
       348 
334 
     | 
    
         
             
            - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
         
     | 
| 
       349 
335 
     | 
    
         
             
            - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
         
     | 
| 
       350 
336 
     | 
    
         
             
            - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
         
     | 
| 
         @@ -385,7 +371,7 @@ SGLang is currently hosted under the non-profit open-source organization [LMSYS] 
     | 
|
| 
       385 
371 
     | 
    
         
             
            <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
         
     | 
| 
       386 
372 
     | 
    
         | 
| 
       387 
373 
     | 
    
         
             
            ## Contact Us
         
     | 
| 
       388 
     | 
    
         
            -
            For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at  
     | 
| 
      
 374 
     | 
    
         
            +
            For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at sglang@lmsys.org
         
     | 
| 
       389 
375 
     | 
    
         | 
| 
       390 
376 
     | 
    
         
             
            ## Acknowledgment
         
     | 
| 
       391 
377 
     | 
    
         
             
            We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
         
     |