sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +87 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +26 -7
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +374 -136
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +25 -27
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +237 -204
- sglang/srt/managers/detokenizer_manager.py +48 -2
- sglang/srt/managers/io_struct.py +57 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +94 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +122 -42
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +51 -23
- sglang/srt/mem_cache/hiradix_cache.py +87 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +77 -14
- sglang/srt/mem_cache/memory_pool_host.py +4 -5
- sglang/srt/mem_cache/radix_cache.py +6 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -5
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +38 -13
- sglang/srt/models/gpt_oss.py +2 -15
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +66 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +122 -56
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +73 -5
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -25,7 +25,6 @@ from typing import List, Literal, Optional, Union
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -39,6 +38,8 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_sm90_supported,
+    is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
@@ -47,12 +48,87 @@ from sglang.srt.utils import (
 logger = logging.getLogger(__name__)


+# Define constants
+LOAD_FORMAT_CHOICES = [
+    "auto",
+    "pt",
+    "safetensors",
+    "npcache",
+    "dummy",
+    "sharded_state",
+    "gguf",
+    "bitsandbytes",
+    "layered",
+    "remote",
+]
+
+QUANTIZATION_CHOICES = [
+    "awq",
+    "fp8",
+    "gptq",
+    "marlin",
+    "gptq_marlin",
+    "awq_marlin",
+    "bitsandbytes",
+    "gguf",
+    "modelopt",
+    "modelopt_fp4",
+    "petit_nvfp4",
+    "w8a8_int8",
+    "w8a8_fp8",
+    "moe_wna16",
+    "qoq",
+    "w4afp8",
+    "mxfp4",
+]
+
+ATTENTION_BACKEND_CHOICES = [
+    # Common
+    "triton",
+    "torch_native",
+    # NVIDIA specific
+    "cutlass_mla",
+    "fa3",
+    "flashinfer",
+    "flashmla",
+    "trtllm_mla",
+    "trtllm_mha",
+    "dual_chunk_flash_attn",
+    # AMD specific
+    "aiter",
+    "wave",
+    # Other platforms
+    "intel_amx",
+    "ascend",
+]
+
+DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
+
+
+# Allow external code to add more choices
+def add_load_format_choices(choices):
+    LOAD_FORMAT_CHOICES.extend(choices)
+
+
+def add_quantization_method_choices(choices):
+    QUANTIZATION_CHOICES.extend(choices)
+
+
+def add_attention_backend_choices(choices):
+    ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_disagg_transfer_backend_choices(choices):
+    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    tokenizer_worker_num: int = 1
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
@@ -199,6 +275,7 @@ class ServerArgs:
     eplb_algorithm: str = "auto"
     eplb_rebalance_num_iterations: int = 1000
     eplb_rebalance_layers_per_chunk: Optional[int] = None
+    eplb_min_rebalancing_utilization_threshold: float = 1.0
     expert_distribution_recorder_mode: Optional[
         Literal["stat", "stat_approx", "per_pass", "per_token"]
     ] = None
@@ -211,11 +288,12 @@ class ServerArgs:
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
-    hicache_write_policy: str = "
+    hicache_write_policy: str = "write_through"
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
+    hicache_storage_backend_extra_config: Optional[str] = None

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -671,6 +749,15 @@ class ServerArgs:
         )
         self.speculative_num_draft_tokens = self.speculative_num_steps + 1

+        if (
+            self.speculative_eagle_topk > 1
+            and self.page_size > 1
+            and self.attention_backend != "flashinfer"
+        ):
+            raise ValueError(
+                "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
+            )
+
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
         # assert self.speculative_num_steps < self.speculative_num_draft_tokens
@@ -741,6 +828,12 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--tokenizer-mode",
             type=str,
@@ -759,18 +852,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
-                "auto",
-                "pt",
-                "safetensors",
-                "npcache",
-                "dummy",
-                "sharded_state",
-                "gguf",
-                "bitsandbytes",
-                "layered",
-                "remote",
-            ],
+            choices=LOAD_FORMAT_CHOICES,
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -889,25 +971,7 @@ class ServerArgs:
             "--quantization",
             type=str,
             default=ServerArgs.quantization,
-            choices=[
-                "awq",
-                "fp8",
-                "gptq",
-                "marlin",
-                "gptq_marlin",
-                "awq_marlin",
-                "bitsandbytes",
-                "gguf",
-                "modelopt",
-                "modelopt_fp4",
-                "petit_nvfp4",
-                "w8a8_int8",
-                "w8a8_fp8",
-                "moe_wna16",
-                "qoq",
-                "w4afp8",
-                "mxfp4",
-            ],
+            choices=QUANTIZATION_CHOICES,
             help="The quantization method.",
         )
         parser.add_argument(
@@ -1357,43 +1421,24 @@ class ServerArgs:
         )

         # Kernel backend
-        ATTN_BACKENDS = [
-            # Common
-            "triton",
-            "torch_native",
-            # NVIDIA specific
-            "cutlass_mla",
-            "fa3",
-            "flashinfer",
-            "flashmla",
-            "trtllm_mla",
-            "trtllm_mha",
-            "dual_chunk_flash_attn",
-            # AMD specific
-            "aiter",
-            "wave",
-            # Other platforms
-            "intel_amx",
-            "ascend",
-        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
         parser.add_argument(
             "--decode-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.decode_attention_backend,
             help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
         )
@@ -1558,6 +1603,12 @@ class ServerArgs:
             default=ServerArgs.eplb_rebalance_layers_per_chunk,
             help="Number of layers to rebalance per forward pass.",
         )
+        parser.add_argument(
+            "--eplb-min-rebalancing-utilization-threshold",
+            type=float,
+            default=ServerArgs.eplb_min_rebalancing_utilization_threshold,
+            help="Minimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1641,6 +1692,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_prefetch_policy,
             help="Control when prefetching from the storage backend should stop.",
         )
+        parser.add_argument(
+            "--hicache-storage-backend-extra-config",
+            type=str,
+            default=ServerArgs.hicache_storage_backend_extra_config,
+            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1951,7 +2008,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=
+            choices=DISAGG_TRANSFER_BACKEND_CHOICES,
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
@@ -2126,6 +2183,9 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"

+        # Check multi tokenizer
+        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

@@ -2271,6 +2331,7 @@ class ServerArgs:
         if is_mxfp4_quant_format:
             # use bf16 for mxfp4 triton kernels
             self.dtype = "bfloat16"
+
         elif "Llama4" in model_arch:
             assert self.attention_backend in {
                 "fa3",
@@ -2368,6 +2429,9 @@ class PortArgs:
     # The ipc filename for Scheduler to send metrics
     metrics_ipc_name: str

+    # The ipc filename for Tokenizer and worker tokenizer
+    tokenizer_worker_ipc_name: Optional[str]
+
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
@@ -2391,6 +2455,7 @@ class PortArgs:
             nccl_port=nccl_port,
             rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
             metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+            tokenizer_worker_ipc_name=None,
         )
     else:
         # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -2424,6 +2489,7 @@ class PortArgs:
             nccl_port=nccl_port,
             rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
             metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
+            tokenizer_worker_ipc_name=None,
         )
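The refactor above moves the argparse choice lists into module-level constants (LOAD_FORMAT_CHOICES, QUANTIZATION_CHOICES, ATTENTION_BACKEND_CHOICES, DISAGG_TRANSFER_BACKEND_CHOICES) and adds add_*_choices() hooks so external code can extend them before the CLI is parsed. A minimal sketch of how an out-of-tree integration might use these hooks; the backend and quantization names below are hypothetical placeholders, not part of this release:

    from sglang.srt import server_args

    # Register extra choices before the server arguments are parsed,
    # so argparse validation accepts them.
    server_args.add_attention_backend_choices(["my_plugin_backend"])
    server_args.add_quantization_method_choices(["my_plugin_quant"])

    # The hooks only extend argparse validation; wiring up the actual kernel or
    # quantization implementation remains the plugin's responsibility.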
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -46,6 +46,7 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     empty_context,
     get_available_gpu_memory,
+    get_bool_env_var,
     is_cuda,
     next_power_of_2,
 )
@@ -54,6 +55,7 @@ if is_cuda():
     from sgl_kernel import segment_packbits

 logger = logging.getLogger(__name__)
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")


 @contextmanager
@@ -137,8 +139,15 @@ class EAGLEWorker(TpModelWorker):
         embed, head = self.target_worker.model_runner.model.get_embed_and_head()

         if self.speculative_algorithm.is_eagle3():
-            # EAGLE3 models don't share lm_head
-            self.draft_model_runner.model.set_embed(embed)
+            # most cases EAGLE3 models don't share lm_head
+            # but some models (e.g. nvidia/gpt-oss-120b-Eagle3) shares
+            if (
+                hasattr(self.draft_model_runner.model, "load_lm_head_from_target")
+                and self.draft_model_runner.model.load_lm_head_from_target
+            ):
+                self.draft_model_runner.model.set_embed_and_head(embed, head)
+            else:
+                self.draft_model_runner.model.set_embed(embed)

         # grab hot token ids
         if self.draft_model_runner.model.hot_token_id is not None:
@@ -781,15 +790,20 @@ class EAGLEWorker(TpModelWorker):
         token_ids_logprobs = batch.token_ids_logprobs
         accepted_indices = res.accepted_indices
         assert len(accepted_indices) == len(logits_output.next_token_logits)
+
         temperatures = batch.sampling_info.temperatures
         num_draft_tokens = batch.spec_info.draft_token_num
         # acceptance indices are the indices in a "flattened" batch.
         # dividing it to num_draft_tokens will yield the actual batch index.
         temperatures = temperatures[accepted_indices // num_draft_tokens]
+        if RETURN_ORIGINAL_LOGPROB:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits, dim=-1
+            )
+        else:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits / temperatures, dim=-1
+            )
         batch_next_token_ids = res.verified_id
         num_tokens_per_req = [accept + 1 for accept in res.accept_length_per_req_cpu]

@@ -806,13 +820,19 @@ class EAGLEWorker(TpModelWorker):
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(
+            ) = get_top_logprobs(
+                logprobs,
+                top_logprobs_nums_repeat_interleaved,
+            )

         if any(x is not None for x in token_ids_logprobs):
             (
                 logits_output.next_token_token_ids_logprobs_val,
                 logits_output.next_token_token_ids_logprobs_idx,
-            ) = get_token_ids_logprobs(
+            ) = get_token_ids_logprobs(
+                logprobs,
+                token_ids_logprobs_repeat_interleaved,
+            )

         logits_output.next_token_logprobs = logprobs[
             torch.arange(len(batch_next_token_ids), device=batch.sampling_info.device),
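In the hunks above, logprob computation is now gated by a RETURN_ORIGINAL_LOGPROB environment variable: when it is set, log-probabilities are computed over the raw logits instead of the temperature-scaled logits that were actually sampled from. A standalone sketch of the two paths; the tensors and the env-var parsing here are illustrative and do not reproduce the exact get_bool_env_var semantics:

    import os
    import torch

    logits = torch.tensor([[2.0, 1.0, 0.5]])   # made-up next-token logits
    temperature = torch.tensor([[0.7]])        # per-request sampling temperature

    if os.environ.get("RETURN_ORIGINAL_LOGPROB", "false").lower() in ("1", "true"):
        # Report logprobs of the original (unscaled) distribution.
        logprobs = torch.log_softmax(logits, dim=-1)
    else:
        # Default behavior: logprobs of the temperature-scaled distribution.
        logprobs = torch.log_softmax(logits / temperature, dim=-1)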
sglang/srt/tokenizer/tiktoken_tokenizer.py
CHANGED
@@ -121,7 +121,12 @@ class TiktokenTokenizer:
         return self.tokenizer.decode_batch(batch)

     def apply_chat_template(
-        self,
+        self,
+        messages,
+        tokenize,
+        add_generation_prompt,
+        tools=None,
+        reasoning_effort=None,
     ):
         ret = self.chat_template_jinja.render(
             messages=messages, add_generation_prompt=add_generation_prompt
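The apply_chat_template signature now spells out messages, tokenize, add_generation_prompt, tools, and reasoning_effort explicitly. An illustrative call against an existing TiktokenTokenizer instance; the message content and argument values are assumptions, not taken from this diff:

    messages = [{"role": "user", "content": "Hello"}]

    # tokenize is required; tools and reasoning_effort are optional keyword arguments.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        tools=None,
        reasoning_effort=None,
    )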
sglang/srt/utils.py
CHANGED
@@ -172,6 +172,20 @@ def is_blackwell():
     return torch.cuda.get_device_capability()[0] == 10


+@lru_cache(maxsize=1)
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
+@lru_cache(maxsize=1)
+def is_sm90_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 9) and (
+        torch.version.cuda >= "12.3"
+    )
+
+
 _warned_bool_env_var_keys = set()

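is_sm90_supported and is_sm100_supported combine the device compute capability (SM90 is Hopper, SM100 is Blackwell) with a minimum CUDA toolkit version (12.3 and 12.8 respectively) and cache the result per process. A hedged sketch of the gating pattern such helpers typically enable; the backend names below are placeholders, not actual sglang identifiers:

    import torch
    from sglang.srt.utils import is_sm90_supported, is_sm100_supported

    def pick_gemm_backend() -> str:
        # Placeholder backend names, for illustration only.
        if torch.cuda.is_available() and is_sm100_supported():
            return "blackwell_fp8_gemm"
        if torch.cuda.is_available() and is_sm90_supported():
            return "hopper_fp8_gemm"
        return "triton_fallback"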
@@ -1665,9 +1679,29 @@ def direct_register_custom_op(
     IMPORTANT: the lifetime of the operator is tied to the lifetime of the
     library object. If you want to bind the operator to a different library,
     make sure the library object is alive when the operator is used.
+
+    Note: This function will silently skip registration if the operator
+    with the same name is already registered to avoid RuntimeError in
+    multi-engine scenarios (e.g., VERL framework).
     """
     import torch.library

+    my_lib = target_lib or sglang_lib
+
+    # Check if operator is already registered to avoid duplicate registration
+    # This is important for scenarios where multiple SGLang engines run in the same process
+    try:
+        # Try to access the operator to see if it's already registered
+        lib_name = my_lib.m.name if hasattr(my_lib.m, "name") else "sglang"
+        if hasattr(torch.ops, lib_name) and hasattr(
+            getattr(torch.ops, lib_name), op_name
+        ):
+            # Operator already exists, skip registration
+            return
+    except (AttributeError, RuntimeError):
+        # Operator doesn't exist, proceed with registration
+        pass
+
     if hasattr(torch.library, "infer_schema"):
         schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
     else:
@@ -1676,11 +1710,22 @@ def direct_register_custom_op(

         schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)

+    try:
+        my_lib.define(op_name + schema_str)
+        my_lib.impl(op_name, op_func, "CUDA")
+        if fake_impl is not None:
+            my_lib._register_fake(op_name, fake_impl)
+    except RuntimeError as error:
+        if "Tried to register an operator" in str(e) and "multiple times" in str(e):
+            # Silently ignore duplicate registration errors
+            # This can happen in multi-engine scenarios
+            pass
+        else:
+            # Re-raise other RuntimeErrors
+            raise error
+    except AttributeError as error:
+        # Always re-raise AttributeError as it indicates missing dependencies
+        raise error


 def set_gpu_proc_affinity(
@@ -1919,6 +1964,15 @@ def get_ip() -> str:
     except Exception:
         pass

+    # try using hostname
+    hostname = socket.gethostname()
+    try:
+        ip_addr = socket.gethostbyname(hostname)
+        warnings.warn("using local ip address: {}".format(ip_addr))
+        return ip_addr
+    except Exception:
+        pass
+
     warnings.warn(
         "Failed to get the IP address, using 0.0.0.0 by default."
         "The value can be set by the environment variable"
@@ -2733,6 +2787,20 @@ def lru_cache_frozenset(maxsize=128):
     return decorator


+def get_worker_ids_from_req_rids(rids):
+    if isinstance(rids, list):
+        worker_ids = [int(rid.split("_")[0]) for rid in rids]
+    elif isinstance(rids, str):
+        worker_ids = [int(rids.split("_")[0])]
+    else:
+        worker_ids = []
+    return worker_ids
+
+
+def get_origin_rid(rid):
+    return rid.split("_", 1)[1] if "_" in rid else rid
+
+
 def apply_module_patch(target_module, target_function, wrappers):
     original_module, original_function = parse_module_path(
         target_module, target_function, False
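The two new helpers assume the multi-tokenizer convention of prefixing each request id with the tokenizer worker index and an underscore. A quick usage sketch with made-up request ids:

    from sglang.srt.utils import get_origin_rid, get_worker_ids_from_req_rids

    rids = ["0_req-abc", "2_req-def"]
    print(get_worker_ids_from_req_rids(rids))  # [0, 2]
    print(get_origin_rid("2_req-def"))         # "req-def"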
sglang/test/attention/test_trtllm_mla_backend.py
CHANGED
@@ -208,6 +208,15 @@ class MockModelRunner:
         self.kv_cache_dtype = config["kv_cache_dtype"]
         self.page_size = config["page_size"]

+        # Server args stub - needed by attention backends
+        self.server_args = type(
+            "ServerArgs",
+            (),
+            {
+                "enable_dp_attention": False,  # Default value for testing
+            },
+        )
+
         # Model-config stub with MLA attributes
         self.model_config = type(
             "ModelConfig",
@@ -833,7 +842,7 @@ class TestTRTLLMMLA(CustomTestCase):

         # Test workspace properties
         self.assertEqual(metadata.workspace.device.type, "cuda")
-        self.assertEqual(metadata.workspace.dtype, torch.
+        self.assertEqual(metadata.workspace.dtype, torch.uint8)
         self.assertGreater(
             metadata.workspace.numel(), 0, "Workspace should have non-zero size"
         )
@@ -993,8 +1002,8 @@ class TestTRTLLMMLA(CustomTestCase):
         )

         # Verify CUDA graph buffers are allocated
-        self.assertIsNotNone(backend.
-        self.assertIsNotNone(backend.
+        self.assertIsNotNone(backend.decode_cuda_graph_kv_indices)
+        self.assertIsNotNone(backend.decode_cuda_graph_workspace)

         # Test capture metadata
         seq_lens = torch.full(
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.1.post2"
+__version__ = "0.5.2rc0"
{sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1.post2
+Version: 0.5.2rc0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -251,18 +251,18 @@ Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.56.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.7.post1; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.
+Requires-Dist: flashinfer_python==0.3.0; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.
+Requires-Dist: flashinfer_python==0.3.0; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -304,6 +304,7 @@ Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Requires-Dist: pytest; extra == "test"
+Requires-Dist: tabulate; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
@@ -374,7 +375,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
-- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).