PyPI - sglang - Versions diffs - 0.5.0rc2__tar.gz → 0.5.1__tar.gz - Mend

sglang 0.5.0rc2__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (882) hide show

{sglang-0.5.0rc2/sglang.egg-info → sglang-0.5.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.0rc2
+Version: 0.5.1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -232,7 +232,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: msgspec; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: openai==1.99.1; extra == "runtime-common"
-Requires-Dist: openai-harmony==0.0.3; extra == "runtime-common"
+Requires-Dist: openai-harmony==0.0.4; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines==0.1.11; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
@@ -240,9 +240,9 @@ Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
+Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: pynvml; extra == "runtime-common"
-Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: sentencepiece; extra == "runtime-common"
@@ -254,7 +254,7 @@ Requires-Dist: torchao==0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.55.2; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.22; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
@@ -278,13 +278,12 @@ Requires-Dist: petit_kernel==0.0.2; extra == "srt-hip"
 Requires-Dist: wave-lang==1.0.1; extra == "srt-hip"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
-Requires-Dist: einops; extra == "srt-cpu"
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Provides-Extra: srt-npu
-Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai==1.99.1; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -375,6 +374,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 ## News
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [Register here](https://lu.ma/gbfhjvuo).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).

{sglang-0.5.0rc2 → sglang-0.5.1}/README.md RENAMED Viewed

@@ -20,6 +20,7 @@
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 ## News
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [Register here](https://lu.ma/gbfhjvuo).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).

{sglang-0.5.0rc2 → sglang-0.5.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.5.0rc2"
+version = "0.5.1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -31,7 +31,7 @@ runtime_common = [
     "msgspec",
     "ninja",
     "openai==1.99.1",
-    "openai-harmony==0.0.3",
+    "openai-harmony==0.0.4",
     "orjson",
     "outlines==0.1.11",
     "packaging",
@@ -39,9 +39,9 @@ runtime_common = [
     "pillow",
     "prometheus-client>=0.20.0",
     "psutil",
+    "pybase64",
     "pydantic",
     "pynvml",
-    "pybase64",
     "python-multipart",
     "pyzmq>=25.1.2",
     "sentencepiece",
@@ -53,7 +53,7 @@ runtime_common = [
     "transformers==4.55.2",
     "uvicorn",
     "uvloop",
-    "xgrammar==0.1.22",
+    "xgrammar==0.1.23",
 ]
 srt = [
@@ -85,8 +85,11 @@ srt_hip = [
     "wave-lang==1.0.1",
 ]
-# CPU: torch wheel for CPU needs to be installed from https://download.pytorch.org/whl/cpu
-srt_cpu = ["sglang[runtime_common]", "einops"]
+# https://docs.sglang.ai/platforms/cpu_server.html
+srt_cpu = ["sglang[runtime_common]"]
+# https://docs.sglang.ai/platforms/ascend_npu.html
+srt_npu = ["sglang[runtime_common]"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
@@ -96,9 +99,6 @@ srt_xpu = ["sglang[runtime_common]"]
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
 srt_hpu = ["sglang[runtime_common]"]
-# https://vllm-ascend.readthedocs.io/en/latest/installation.html
-srt_npu = ["sglang[runtime_common]"]
 openai = ["openai==1.99.1", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/bench_one_batch.py RENAMED Viewed

@@ -61,7 +61,6 @@ from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -300,11 +299,6 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
             disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
             spec_algorithm=SpeculativeAlgorithm.NONE,
             speculative_num_draft_tokens=None,
-            enable_two_batch_overlap=model_runner.server_args.enable_two_batch_overlap,
-            enable_deepep_moe=MoeA2ABackend(
-                model_runner.server_args.moe_a2a_backend
-            ).is_deepep(),
-            deepep_mode=DeepEPMode(model_runner.server_args.deepep_mode),
             require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
             disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
         )

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/bench_one_batch_server.py RENAMED Viewed

@@ -26,7 +26,7 @@ from sglang.bench_serving import get_tokenizer, sample_random_requests
 from sglang.profiler import run_profile
 from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_process_tree
+from sglang.srt.utils import is_blackwell, kill_process_tree
 from sglang.test.test_utils import is_in_ci, write_github_step_summary
@@ -363,7 +363,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         acc_length,
         trace_link,
     ) in result:
-        hourly_cost = 2 * server_args.tp_size  # $2/hour for one H100
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
         input_util = 0.7
         accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
         line = (

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/bench_serving.py RENAMED Viewed

@@ -864,11 +864,11 @@ def sample_mmmu_requests(
                     if image.mode == "RGBA":
                         image = image.convert("RGB")
-                    # Encode image to base64
+                    # Encode image to base64 (save as PNG to support palette/alpha modes)
                     buffered = io.BytesIO()
-                    image.save(buffered, format="JPEG")
+                    image.save(buffered, format="PNG")
                     img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
-                    image_data = f"data:image/jpeg;base64,{img_str}"
+                    image_data = f"data:image/png;base64,{img_str}"
                 else:
                     continue

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/eval/llama3_eval.py RENAMED Viewed

@@ -12,7 +12,6 @@ from dataclasses import dataclass
 import httpx
 import numpy as np
 import openai
-import transformers
 from datasets import load_dataset
 from openai import AsyncOpenAI
 from tqdm import tqdm

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/configs/model_config.py RENAMED Viewed

@@ -32,6 +32,7 @@ from sglang.srt.hf_transformers_utils import (
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, is_hip
+from sglang.utils import is_in_ci
 logger = logging.getLogger(__name__)
@@ -166,19 +167,20 @@ class ModelConfig:
         derived_context_len = get_context_length(self.hf_text_config)
         if context_length is not None:
             if context_length > derived_context_len:
-                if get_bool_env_var(
-                    "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", default="True"
+                reason = "Target model's" if is_draft_model else "User-specified"
+                msg = (
+                    f"Warning: {reason} context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                    f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
+                )
+                if (
+                    get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
+                    or is_in_ci()  # FIXME: fix this special case
                 ):
-                    logger.warning(
-                        f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
-                        f"This may lead to incorrect model outputs or CUDA errors."
-                    )
+                    logger.warning(msg)
                     self.context_len = context_length
                 else:
                     raise ValueError(
-                        f"User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
-                        f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config. "
-                        f"To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+                        f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
                     )
             else:
                 self.context_len = context_length
@@ -341,6 +343,19 @@ class ModelConfig:
                 "kv_n_heads",
                 self.hf_config.num_attention_heads,
             )
+        if self.hf_config.model_type in ["nemotron-nas"]:
+            nkvh = {
+                self.hf_config.num_attention_heads // block.attention.n_heads_in_group
+                for block in self.hf_config.block_configs
+                if not block.attention.no_op
+            }
+            if len(nkvh) == 0:
+                raise RuntimeError("Couldn't determine number of kv heads")
+            if len(nkvh) > 1:
+                raise ValueError(
+                    "Variable GQA (VGQA) is not yet supported for nemotron-nas in sglang"
+                )
+            return next(iter(nkvh))
         attributes = [
             # For Falcon:

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/configs/update_config.py RENAMED Viewed

@@ -49,14 +49,25 @@ def get_num_heads_padding_size(tp_size, weight_block_size):
 def update_intermediate_size(model_config, attr_name, intermediate_padding_size):
-    if hasattr(model_config.hf_config, attr_name):
+    attr_value = intermediate_padding_size
+    if hasattr(model_config, "hf_config") and hasattr(
+        model_config.hf_config, attr_name
+    ):
         attr_value = getattr(model_config.hf_config, attr_name)
-        if attr_value % intermediate_padding_size != 0:
-            from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+    elif hasattr(model_config, attr_name):
+        attr_value = getattr(model_config, attr_name)
+    if attr_value % intermediate_padding_size != 0:
+        from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
-            attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+        attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+        if hasattr(model_config, "hf_config"):
             setattr(model_config.hf_config, attr_name, attr_value)
-            setattr(model_config.hf_text_config, attr_name, attr_value)
+            if hasattr(model_config, "hf_text_config"):
+                setattr(model_config.hf_text_config, attr_name, attr_value)
+        else:
+            setattr(model_config, attr_name, attr_value)
     return model_config
@@ -118,4 +129,28 @@ def adjust_config_with_unaligned_cpu_tp(
     model_config = update_intermediate_size(
         model_config, "intermediate_size_mlp", intermediate_padding_size
     )
+    if (
+        hasattr(model_config.hf_config, "vision_config")
+        and model_config.hf_config.vision_config.model_type == "siglip_vision_model"
+    ):
+        model_config.hf_config.vision_config.original_num_attention_heads = (
+            model_config.num_attention_heads
+        )
+        if model_config.hf_config.vision_config.num_attention_heads % tp_size != 0:
+            model_config.hf_config.vision_config.head_dim = (
+                model_config.hf_config.vision_config.hidden_size
+                // model_config.hf_config.vision_config.num_attention_heads
+            )
+            from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+            pad_size = get_num_heads_padding_size(tp_size, weight_block_size)
+            model_config.hf_config.vision_config.num_attention_heads = pad_vocab_size(
+                model_config.hf_config.vision_config.num_attention_heads, pad_size
+            )
+        model_config.hf_config.vision_config = update_intermediate_size(
+            model_config.hf_config.vision_config,
+            "intermediate_size",
+            intermediate_padding_size,
+        )
     return model_config

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/constrained/xgrammar_backend.py RENAMED Viewed

@@ -32,10 +32,15 @@ from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
     BaseGrammarObject,
 )
-from sglang.srt.constrained.triton_ops.bitmask_ops import (
-    apply_token_bitmask_inplace_triton,
-)
+from sglang.srt.utils import is_hip
+_is_hip = is_hip()
+if _is_hip:
+    from sgl_kernel import apply_token_bitmask_inplace_cuda
+else:
+    from sglang.srt.constrained.triton_ops.bitmask_ops import (
+        apply_token_bitmask_inplace_triton,
+    )
 logger = logging.getLogger(__name__)
@@ -94,7 +99,10 @@ class XGrammarGrammar(BaseGrammarObject):
     def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
         if logits.device.type == "cuda":
-            apply_token_bitmask_inplace_triton(logits, vocab_mask)
+            if _is_hip:
+                apply_token_bitmask_inplace_cuda(logits, vocab_mask)
+            else:
+                apply_token_bitmask_inplace_triton(logits, vocab_mask)
         elif logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
             self.apply_vocab_mask_cpu(logits, vocab_mask)
         else:
@@ -154,12 +162,16 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
     ):
         super().__init__()
-        # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
-        # This ensures consistency between what the model considers EOS and what XGrammar uses
-        tokenizer_info = TokenizerInfo.from_huggingface(
-            tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
-        )
-        override_stop_tokens = None
+        if hasattr(tokenizer, "init_xgrammar"):
+            # For special tokenizer
+            tokenizer_info, override_stop_tokens = tokenizer.init_xgrammar()
+        else:
+            # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
+            # This ensures consistency between what the model considers EOS and what XGrammar uses
+            tokenizer_info = TokenizerInfo.from_huggingface(
+                tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
+            )
+            override_stop_tokens = None
         self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/conversation.py RENAMED Viewed

@@ -625,7 +625,7 @@ def generate_chat_conv(
                         real_content += content.text
                     elif content.type == "image_url":
                         # NOTE: works for llava and intervl2_5
-                        if conv.name in ["internvl-2-5", "interns1"]:
+                        if conv.name in ["internvl-2-5"]:
                             real_content = image_token + real_content
                         else:
                             real_content += image_token
@@ -817,20 +817,7 @@ register_conv_template(
         sep_style=SeparatorStyle.MPT,
         sep="<|im_end|>\n",
         stop_str=["<|im_end|>", "<|action_end|>"],
-        image_token="<image>",
-    )
-)
-register_conv_template(
-    Conversation(
-        name="interns1",
-        system_template="<|im_start|>system\n{system_message}",
-        system_message="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室).  It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
-        roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
-        sep_style=SeparatorStyle.MPT,
-        sep="<|im_end|>\n",
-        stop_str=["<|im_end|>", "<|action_end|>"],
-        image_token="<image>",
+        image_token="<IMG_CONTEXT>",
     )
 )

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/disaggregation/ascend/conn.py RENAMED Viewed

@@ -23,9 +23,7 @@ class AscendKVManager(MooncakeKVManager):
         )
     def register_buffer_to_engine(self):
-        self.engine.register(
-            self.kv_args.kv_data_ptrs[0], sum(self.kv_args.kv_data_lens)
-        )
+        self.engine.batch_register(self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens)
         # The Ascend backend optimize batch registration for small memory blocks.
         self.engine.batch_register(
             self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/disaggregation/base/conn.py RENAMED Viewed

@@ -30,6 +30,7 @@ class KVArgs:
     # for pp prefill
     prefill_pp_size: int
     pp_rank: int
+    prefill_start_layer: int
     # for system dp
     system_dp_rank: int

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/disaggregation/decode.py RENAMED Viewed

@@ -259,7 +259,7 @@ class DecodePreallocQueue:
         if len(req.origin_input_ids) > self.max_total_num_tokens:
             message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
             logger.error(message)
-            prepare_abort(req, message)
+            prepare_abort(req, message, status_code=HTTPStatus.BAD_REQUEST)
             self.scheduler.stream_output([req], req.return_logprob)
             return True
         return False

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/disaggregation/launch_lb.py RENAMED Viewed

@@ -118,7 +118,13 @@ def main():
     lb_args = LBArgs.from_cli_args(args)
     prefill_configs = [PrefillConfig(url, port) for url, port in lb_args.prefill_infos]
-    run(prefill_configs, lb_args.decode_infos, lb_args.host, lb_args.port)
+    run(
+        prefill_configs,
+        lb_args.decode_infos,
+        lb_args.host,
+        lb_args.port,
+        lb_args.timeout,
+    )
 if __name__ == "__main__":

{sglang-0.5.0rc2 → sglang-0.5.1}/sglang/srt/disaggregation/mini_lb.py RENAMED Viewed

@@ -50,10 +50,16 @@ class PrefillConfig:
 class MiniLoadBalancer:
-    def __init__(self, prefill_configs: List[PrefillConfig], decode_servers: List[str]):
+    def __init__(
+        self,
+        prefill_configs: List[PrefillConfig],
+        decode_servers: List[str],
+        timeout: int,
+    ):
         self.prefill_configs = prefill_configs
         self.prefill_servers = [p.url for p in prefill_configs]
         self.decode_servers = decode_servers
+        self.timeout = timeout
     def add_prefill_server(self, new_prefill_config: PrefillConfig):
         self.prefill_configs.append(new_prefill_config)
@@ -78,7 +84,7 @@ class MiniLoadBalancer:
         async with aiohttp.ClientSession(
             timeout=aiohttp.ClientTimeout(
-                total=3600
+                total=self.timeout
             )  # Add timeout for request reliability
         ) as session:
             tasks = [
@@ -117,7 +123,7 @@ class MiniLoadBalancer:
         async def stream_results():
             async with aiohttp.ClientSession(
                 timeout=aiohttp.ClientTimeout(
-                    total=3600
+                    total=self.timeout
                 )  # Add timeout for request reliability
             ) as session:
                 # Create the tasks for both prefill and decode requests
@@ -401,9 +407,9 @@ async def register(obj: PDRegistryRequest):
     return Response(status_code=200)
-def run(prefill_configs, decode_addrs, host, port):
+def run(prefill_configs, decode_addrs, host, port, timeout):
     global load_balancer
-    load_balancer = MiniLoadBalancer(prefill_configs, decode_addrs)
+    load_balancer = MiniLoadBalancer(prefill_configs, decode_addrs, timeout=timeout)
     uvicorn.run(app, host=host, port=port)

sglang 0.5.0rc2__tar.gz → 0.5.1__tar.gz

sglang 0.5.0rc2__tar.gz → 0.5.1__tar.gz