sglang 0.4.5__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +23 -2
  3. sglang/bench_serving.py +6 -4
  4. sglang/lang/backend/anthropic.py +0 -4
  5. sglang/lang/backend/base_backend.py +1 -1
  6. sglang/lang/backend/openai.py +1 -1
  7. sglang/lang/backend/vertexai.py +0 -1
  8. sglang/lang/compiler.py +1 -7
  9. sglang/lang/tracer.py +3 -7
  10. sglang/srt/_custom_ops.py +0 -2
  11. sglang/srt/configs/model_config.py +37 -5
  12. sglang/srt/constrained/base_grammar_backend.py +26 -5
  13. sglang/srt/constrained/llguidance_backend.py +1 -0
  14. sglang/srt/constrained/outlines_backend.py +1 -0
  15. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  16. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  17. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  18. sglang/srt/constrained/xgrammar_backend.py +27 -4
  19. sglang/srt/custom_op.py +0 -62
  20. sglang/srt/disaggregation/base/__init__.py +8 -0
  21. sglang/srt/disaggregation/base/conn.py +113 -0
  22. sglang/srt/disaggregation/decode.py +80 -11
  23. sglang/srt/disaggregation/mini_lb.py +58 -123
  24. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  25. sglang/srt/disaggregation/mooncake/conn.py +585 -0
  26. sglang/srt/disaggregation/mooncake/transfer_engine.py +77 -0
  27. sglang/srt/disaggregation/prefill.py +82 -22
  28. sglang/srt/disaggregation/utils.py +46 -0
  29. sglang/srt/entrypoints/EngineBase.py +53 -0
  30. sglang/srt/entrypoints/engine.py +36 -8
  31. sglang/srt/entrypoints/http_server.py +37 -8
  32. sglang/srt/entrypoints/http_server_engine.py +142 -0
  33. sglang/srt/entrypoints/verl_engine.py +42 -13
  34. sglang/srt/hf_transformers_utils.py +4 -0
  35. sglang/srt/layers/activation.py +6 -8
  36. sglang/srt/layers/attention/flashattention_backend.py +430 -257
  37. sglang/srt/layers/attention/flashinfer_backend.py +18 -9
  38. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  39. sglang/srt/layers/attention/triton_backend.py +6 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
  41. sglang/srt/layers/attention/vision.py +1 -1
  42. sglang/srt/layers/dp_attention.py +2 -4
  43. sglang/srt/layers/elementwise.py +15 -2
  44. sglang/srt/layers/layernorm.py +1 -1
  45. sglang/srt/layers/linear.py +18 -3
  46. sglang/srt/layers/moe/ep_moe/layer.py +15 -29
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  48. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +34 -34
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -34
  56. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  57. sglang/srt/layers/moe/router.py +7 -1
  58. sglang/srt/layers/moe/topk.py +63 -45
  59. sglang/srt/layers/parameter.py +0 -2
  60. sglang/srt/layers/quantization/__init__.py +13 -5
  61. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  62. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +12 -2
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -77
  64. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  65. sglang/srt/layers/quantization/fp8.py +131 -136
  66. sglang/srt/layers/quantization/fp8_kernel.py +328 -46
  67. sglang/srt/layers/quantization/fp8_utils.py +206 -253
  68. sglang/srt/layers/quantization/kv_cache.py +43 -52
  69. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  70. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  71. sglang/srt/layers/quantization/utils.py +5 -11
  72. sglang/srt/layers/quantization/w8a8_fp8.py +156 -4
  73. sglang/srt/layers/quantization/w8a8_int8.py +8 -7
  74. sglang/srt/layers/radix_attention.py +28 -1
  75. sglang/srt/layers/rotary_embedding.py +15 -3
  76. sglang/srt/layers/sampler.py +5 -10
  77. sglang/srt/lora/backend/base_backend.py +18 -2
  78. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  79. sglang/srt/lora/backend/triton_backend.py +1 -1
  80. sglang/srt/lora/layers.py +1 -1
  81. sglang/srt/lora/lora.py +1 -1
  82. sglang/srt/lora/lora_manager.py +1 -1
  83. sglang/srt/managers/detokenizer_manager.py +0 -1
  84. sglang/srt/managers/io_struct.py +255 -97
  85. sglang/srt/managers/mm_utils.py +7 -5
  86. sglang/srt/managers/multimodal_processor.py +0 -2
  87. sglang/srt/managers/multimodal_processors/base_processor.py +117 -79
  88. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  89. sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  90. sglang/srt/managers/schedule_batch.py +64 -25
  91. sglang/srt/managers/scheduler.py +80 -82
  92. sglang/srt/managers/tokenizer_manager.py +18 -3
  93. sglang/srt/managers/tp_worker.py +1 -0
  94. sglang/srt/mem_cache/hiradix_cache.py +5 -1
  95. sglang/srt/mem_cache/memory_pool.py +21 -3
  96. sglang/srt/metrics/collector.py +9 -0
  97. sglang/srt/model_executor/cuda_graph_runner.py +9 -6
  98. sglang/srt/model_executor/forward_batch_info.py +234 -15
  99. sglang/srt/model_executor/model_runner.py +67 -35
  100. sglang/srt/model_loader/loader.py +31 -4
  101. sglang/srt/model_loader/weight_utils.py +4 -2
  102. sglang/srt/models/baichuan.py +2 -0
  103. sglang/srt/models/bert.py +398 -0
  104. sglang/srt/models/chatglm.py +1 -0
  105. sglang/srt/models/commandr.py +1 -0
  106. sglang/srt/models/dbrx.py +1 -0
  107. sglang/srt/models/deepseek.py +2 -1
  108. sglang/srt/models/deepseek_nextn.py +74 -70
  109. sglang/srt/models/deepseek_v2.py +494 -366
  110. sglang/srt/models/exaone.py +1 -0
  111. sglang/srt/models/gemma.py +1 -0
  112. sglang/srt/models/gemma2.py +1 -0
  113. sglang/srt/models/gemma3_causal.py +1 -0
  114. sglang/srt/models/gpt2.py +1 -0
  115. sglang/srt/models/gpt_bigcode.py +1 -0
  116. sglang/srt/models/granite.py +1 -0
  117. sglang/srt/models/grok.py +1 -0
  118. sglang/srt/models/internlm2.py +1 -0
  119. sglang/srt/models/llama.py +6 -5
  120. sglang/srt/models/llama4.py +101 -34
  121. sglang/srt/models/minicpm.py +1 -0
  122. sglang/srt/models/minicpm3.py +30 -200
  123. sglang/srt/models/mixtral.py +1 -0
  124. sglang/srt/models/mixtral_quant.py +1 -0
  125. sglang/srt/models/mllama.py +51 -8
  126. sglang/srt/models/mllama4.py +102 -29
  127. sglang/srt/models/olmo.py +1 -0
  128. sglang/srt/models/olmo2.py +1 -0
  129. sglang/srt/models/olmoe.py +1 -0
  130. sglang/srt/models/phi3_small.py +1 -0
  131. sglang/srt/models/qwen.py +1 -0
  132. sglang/srt/models/qwen2.py +5 -1
  133. sglang/srt/models/qwen2_5_vl.py +35 -70
  134. sglang/srt/models/qwen2_moe.py +15 -13
  135. sglang/srt/models/qwen2_vl.py +27 -25
  136. sglang/srt/models/qwen3.py +335 -0
  137. sglang/srt/models/qwen3_moe.py +423 -0
  138. sglang/srt/models/stablelm.py +1 -0
  139. sglang/srt/models/xverse.py +1 -0
  140. sglang/srt/models/xverse_moe.py +1 -0
  141. sglang/srt/openai_api/adapter.py +4 -1
  142. sglang/srt/patch_torch.py +11 -0
  143. sglang/srt/reasoning_parser.py +0 -1
  144. sglang/srt/sampling/sampling_batch_info.py +2 -3
  145. sglang/srt/server_args.py +55 -19
  146. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  147. sglang/srt/speculative/eagle_utils.py +1 -11
  148. sglang/srt/speculative/eagle_worker.py +10 -9
  149. sglang/srt/utils.py +136 -10
  150. sglang/test/attention/test_flashattn_backend.py +259 -221
  151. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  152. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  153. sglang/test/runners.py +5 -1
  154. sglang/test/test_block_fp8.py +224 -0
  155. sglang/test/test_custom_ops.py +1 -1
  156. sglang/test/test_utils.py +19 -8
  157. sglang/version.py +1 -1
  158. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +15 -5
  159. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +162 -147
  160. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
  161. sglang/lang/__init__.py +0 -0
  162. sglang/srt/disaggregation/conn.py +0 -81
  163. sglang/srt/lora/backend/__init__.py +0 -25
  164. sglang/srt/server.py +0 -18
  165. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
  166. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -155,7 +155,7 @@ class ServerArgs:
  enable_nccl_nvls: bool = False
  disable_outlines_disk_cache: bool = False
  disable_custom_all_reduce: bool = False
- disable_mla: bool = False
+ enable_llama4_multimodal: Optional[bool] = None
  disable_overlap_schedule: bool = False
  enable_mixed_chunk: bool = False
  enable_dp_attention: bool = False
@@ -179,12 +179,12 @@ class ServerArgs:
  tool_call_parser: Optional[str] = None
  enable_hierarchical_cache: bool = False
  hicache_ratio: float = 2.0
- enable_flashinfer_mla: bool = False # TODO: remove this argument
- enable_flashmla: bool = False
  flashinfer_mla_disable_ragged: bool = False
  warmups: Optional[str] = None
+ moe_dense_tp_size: Optional[int] = None
  n_share_experts_fusion: int = 0
- disable_shared_experts_fusion: bool = False
+ disable_chunked_prefix_cache: bool = False
+ disable_fast_image_processor: bool = False

  # Debug tensor dumps
  debug_tensor_dump_output_folder: Optional[str] = None
@@ -194,6 +194,8 @@ class ServerArgs:
  # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
  disaggregation_mode: str = "null"
  disaggregation_bootstrap_port: int = 8998
+ disaggregation_transfer_backend: str = "mooncake"
+ disaggregation_ib_device: Optional[str] = None

  def __post_init__(self):
  # Expert parallelism
@@ -226,9 +228,6 @@ class ServerArgs:
  # GPU memory is not known yet or no GPU is available.
  gpu_mem = None

- if is_hip():
- self.disable_shared_experts_fusion = True
-
  # Set mem fraction static, which depends on the tensor parallelism size
  if self.mem_fraction_static is None:
  if self.tp_size >= 16:
@@ -251,7 +250,12 @@ class ServerArgs:

  assert self.chunked_prefill_size % self.page_size == 0

- if self.enable_flashmla is True:
+ assert self.moe_dense_tp_size in {
+ 1,
+ None,
+ }, f"moe_dense_tp_size only support 1 and None currently"
+
+ if self.attention_backend == "flashmla":
  logger.warning(
  "FlashMLA only supports a page_size of 64, change page_size to 64."
  )
@@ -294,6 +298,8 @@ class ServerArgs:
  f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
  )

+ self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
+
  # Data parallelism attention
  if self.enable_dp_attention:
  self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -386,6 +392,10 @@ class ServerArgs:
  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
  "1" if self.enable_torch_compile else "0"
  )
+ # Set env var before grammar backends init
+ os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
+ "1" if self.disable_outlines_disk_cache else "0"
+ )

  @staticmethod
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -495,6 +505,7 @@ class ServerArgs:
  "bitsandbytes",
  "gguf",
  "modelopt",
+ "modelopt_fp4",
  "w8a8_int8",
  "w8a8_fp8",
  "moe_wna16",
@@ -817,7 +828,7 @@ class ServerArgs:
  parser.add_argument(
  "--attention-backend",
  type=str,
- choices=["flashinfer", "triton", "torch_native", "fa3"],
+ choices=["flashinfer", "triton", "torch_native", "fa3", "flashmla"],
  default=ServerArgs.attention_backend,
  help="Choose the kernels for attention layers.",
  )
@@ -837,13 +848,13 @@ class ServerArgs:
  )
  parser.add_argument(
  "--enable-flashinfer-mla",
- action="store_true",
- help="Enable FlashInfer MLA optimization. This argument will be deprecated soon! Please use '--attention-backend flashinfer' instead for switching on flashfiner mla!",
+ action=DeprecatedAction,
+ help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
  )
  parser.add_argument(
  "--enable-flashmla",
- action="store_true",
- help="Enable FlashMLA decode optimization",
+ action=DeprecatedAction,
+ help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
  )
  parser.add_argument(
  "--flashinfer-mla-disable-ragged",
@@ -969,9 +980,10 @@ class ServerArgs:
  help="Disable the custom all-reduce kernel and fall back to NCCL.",
  )
  parser.add_argument(
- "--disable-mla",
+ "--enable-llama4-multimodal",
+ default=ServerArgs.enable_llama4_multimodal,
  action="store_true",
- help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
+ help="Enable the multimodal functionality for Llama-4.",
  )
  parser.add_argument(
  "--disable-overlap-schedule",
@@ -1096,10 +1108,17 @@ class ServerArgs:
  action="store_true",
  help="Enabling DeepEP MoE implementation for EP MoE.",
  )
+ parser.add_argument(
+ "--moe-dense-tp-size",
+ type=int,
+ default=ServerArgs.moe_dense_tp_size,
+ help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+ )
  parser.add_argument(
  "--deepep-mode",
  type=str,
  choices=["normal", "low_latency", "auto"],
+ default="auto",
  help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
  )

@@ -1107,13 +1126,18 @@ class ServerArgs:
  "--n-share-experts-fusion",
  type=int,
  default=0,
- help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1 "
- "we use tp_size by default.",
+ help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
+ "set it to tp_size can get best optimized performace.",
  )
  parser.add_argument(
- "--disable-shared-experts-fusion",
+ "--disable-chunked-prefix-cache",
  action="store_true",
- help="Disable shared experts fusion by setting n_share_experts_fusion to 0.",
+ help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
+ )
+ parser.add_argument(
+ "--disable-fast-image-processor",
+ action="store_true",
+ help="Adopt base image processor instead of fast image processor.",
  )

  # Server warmups
@@ -1159,6 +1183,18 @@ class ServerArgs:
  default=ServerArgs.disaggregation_bootstrap_port,
  help="Bootstrap server port on the prefill server. Default is 8998.",
  )
+ parser.add_argument(
+ "--disaggregation-transfer-backend",
+ type=str,
+ default=ServerArgs.disaggregation_transfer_backend,
+ help="The backend for disaggregation transfer. Default is mooncake.",
+ )
+ parser.add_argument(
+ "--disaggregation-ib-device",
+ type=str,
+ default=ServerArgs.disaggregation_ib_device,
+ help="The ib device for disaggregation transfer. Default is None, it will be detected automatically if using the mooncake backend.",
+ )

  @classmethod
  def from_cli_args(cls, args: argparse.Namespace):
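Note: the reworked CLI surface above can be exercised directly through ServerArgs. A minimal sketch follows; the model id and flag values are illustrative, and a full launch may still require a GPU environment:

    import argparse

    from sglang.srt.server_args import ServerArgs

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args(
        [
            "--model-path", "deepseek-ai/DeepSeek-V3",        # illustrative model id
            "--attention-backend", "flashmla",                # replaces the removed --enable-flashmla flag
            "--moe-dense-tp-size", "1",                       # new flag; only 1 or None are accepted
            "--disaggregation-transfer-backend", "mooncake",  # new flag; mooncake is the default
        ]
    )
    server_args = ServerArgs.from_cli_args(args)
    print(server_args.attention_backend, server_args.moe_dense_tp_size)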
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED
@@ -84,10 +84,10 @@ class EAGLEDraftCudaGraphRunner:
  raise Exception(
  f"Capture cuda graph failed: {e}\n"
  "Possible solutions:\n"
- "1. disable cuda graph by --disable-cuda-graph\n"
- "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
- "3. disable torch compile by not using --enable-torch-compile\n"
- "4. specify --dtype to the same dtype (e.g. bfloat16)\n"
+ "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+ "2. disable torch compile by not using --enable-torch-compile\n"
+ "3. specify --dtype to the same dtype (e.g. bfloat16)\n"
+ "4. disable cuda graph by --disable-cuda-graph\n"
  "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
  )

sglang/srt/speculative/eagle_utils.py CHANGED
@@ -19,7 +19,7 @@ from sglang.srt.managers.schedule_batch import (
  from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
  from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
  from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
- from sglang.srt.utils import is_cuda_available, is_hip, next_power_of_2
+ from sglang.srt.utils import fast_topk, is_cuda_available, is_hip, next_power_of_2

  if is_cuda_available():
  from sgl_kernel import (
@@ -772,16 +772,6 @@ def select_top_k_tokens(
  return input_ids, hidden_states, scores, tree_info


- def fast_topk(values, topk, dim):
- if topk == 1:
- # Use max along the specified dimension to get both value and index
- max_value, max_index = torch.max(values, dim=dim)
- return max_value.unsqueeze(1), max_index.unsqueeze(1)
- else:
- # Use topk for efficiency with larger k values
- return torch.topk(values, topk, dim=dim)
-
-
  def _generate_simulated_accept_index(
  accept_index,
  predict,
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -31,11 +31,15 @@ from sglang.srt.speculative.eagle_utils import (
  EagleVerifyInput,
  EagleVerifyOutput,
  assign_draft_cache_locs,
- fast_topk,
  select_top_k_tokens,
  )
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
- from sglang.srt.utils import empty_context, get_available_gpu_memory, is_cuda_available
+ from sglang.srt.utils import (
+ empty_context,
+ fast_topk,
+ get_available_gpu_memory,
+ is_cuda_available,
+ )

  if is_cuda_available():
  from sgl_kernel import segment_packbits
@@ -267,14 +271,11 @@ class EAGLEWorker(TpModelWorker):
  )
  elif batch.forward_mode.is_idle():
  model_worker_batch = batch.get_model_worker_batch()
- logits_output, next_token_ids, _ = (
- self.target_worker.forward_batch_generation(
- ForwardBatch.init_new(
- model_worker_batch, self.target_worker.model_runner
- )
- )
+ logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+ model_worker_batch
  )
- return logits_output, next_token_ids, model_worker_batch.bid, 0, False
+
+ return logits_output, next_token_ids, model_worker_batch.bid, 0
  else:
  logits_output, next_token_ids, bid = self.forward_target_extend(batch)
  with self.draft_tp_context(self.draft_model_runner.tp_group):
sglang/srt/utils.py CHANGED
@@ -16,6 +16,7 @@ import base64
  import builtins
  import ctypes
  import dataclasses
+ import importlib
  import io
  import ipaddress
  import itertools
@@ -54,7 +55,6 @@ import torch.distributed
  import torch.distributed as dist
  import triton
  import zmq
- from decord import VideoReader, cpu
  from fastapi.responses import ORJSONResponse
  from packaging import version as pkg_version
  from PIL import Image
@@ -127,7 +127,7 @@ def is_flashinfer_available():
  """
  if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
  return False
- return is_cuda()
+ return importlib.util.find_spec("flashinfer") is not None and is_cuda()


  def is_cuda_available():
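The availability check above now probes for the flashinfer package before reporting True. A standalone sketch of the same probe pattern (the helper name here is ours, not part of sglang):

    import importlib.util

    def has_optional_package(name: str) -> bool:
        # find_spec locates an importable package without importing it,
        # so a missing optional wheel is detected cheaply and safely.
        return importlib.util.find_spec(name) is not None

    print(has_optional_package("flashinfer"))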
@@ -544,6 +544,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra


  def encode_video(video_path, frame_count_limit=None):
+ # Lazy import because decord is not available on some arm platforms.
+ from decord import VideoReader, cpu
+
  if not os.path.exists(video_path):
  logger.error(f"Video {video_path} does not exist")
  return []
@@ -568,7 +571,7 @@ def encode_video(video_path, frame_count_limit=None):


  def load_image(
- image_file: Union[Image.Image, str, bytes]
+ image_file: Union[Image.Image, str, bytes],
  ) -> tuple[Image.Image, tuple[int, int]]:
  image = image_size = None
  if isinstance(image_file, Image.Image):
@@ -845,33 +848,38 @@ def broadcast_pyobj(
  rank: int,
  dist_group: Optional[torch.distributed.ProcessGroup] = None,
  src: int = 0,
+ force_cpu_device: bool = True,
  ):
  """Broadcast inputs from rank=0 to all other ranks with torch.dist backend."""
+ device = torch.device(
+ "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
+ )

  if rank == 0:
  if len(data) == 0:
- tensor_size = torch.tensor([0], dtype=torch.long)
+ tensor_size = torch.tensor([0], dtype=torch.long, device=device)
  dist.broadcast(tensor_size, src=src, group=dist_group)
  else:
  serialized_data = pickle.dumps(data)
  size = len(serialized_data)
+
  tensor_data = torch.ByteTensor(
  np.frombuffer(serialized_data, dtype=np.uint8)
- )
- tensor_size = torch.tensor([size], dtype=torch.long)
+ ).to(device)
+ tensor_size = torch.tensor([size], dtype=torch.long, device=device)

  dist.broadcast(tensor_size, src=src, group=dist_group)
  dist.broadcast(tensor_data, src=src, group=dist_group)
  return data
  else:
- tensor_size = torch.tensor([0], dtype=torch.long)
+ tensor_size = torch.tensor([0], dtype=torch.long, device=device)
  dist.broadcast(tensor_size, src=src, group=dist_group)
  size = tensor_size.item()

  if size == 0:
  return []

- tensor_data = torch.empty(size, dtype=torch.uint8)
+ tensor_data = torch.empty(size, dtype=torch.uint8, device=device)
  dist.broadcast(tensor_data, src=src, group=dist_group)

  serialized_data = bytes(tensor_data.cpu().numpy())
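For reference, a minimal single-process sketch of calling broadcast_pyobj with the new force_cpu_device argument; it assumes a gloo process group, and the port and payload are arbitrary:

    import torch.distributed as dist

    from sglang.srt.utils import broadcast_pyobj

    # One-rank gloo group, just enough to exercise the call path.
    dist.init_process_group(
        backend="gloo", init_method="tcp://127.0.0.1:29511", rank=0, world_size=1
    )
    payload = [{"req_id": 0, "prompt": "hello"}]
    # force_cpu_device=True (the default) keeps the size/data tensors on CPU, which gloo
    # expects; pass False to place them on CUDA when broadcasting over an NCCL group.
    out = broadcast_pyobj(payload, rank=0, dist_group=None, src=0, force_cpu_device=True)
    assert out == payload
    dist.destroy_process_group()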
@@ -1480,14 +1488,43 @@ def permute_weight(x: torch.Tensor) -> torch.Tensor:


  class MultiprocessingSerializer:
  @staticmethod
- def serialize(obj):
+ def serialize(obj, output_str: bool = False):
+ """
+ Serialize a Python object using ForkingPickler.
+
+ Args:
+ obj: The object to serialize.
+ output_str (bool): If True, return a base64-encoded string instead of raw bytes.
+
+ Returns:
+ bytes or str: The serialized object.
+ """
  buf = io.BytesIO()
  ForkingPickler(buf).dump(obj)
  buf.seek(0)
- return buf.read()
+ output = buf.read()
+
+ if output_str:
+ # Convert bytes to base64-encoded string
+ output = base64.b64encode(output).decode("utf-8")
+
+ return output

  @staticmethod
  def deserialize(data):
+ """
+ Deserialize a previously serialized object.
+
+ Args:
+ data (bytes or str): The serialized data, optionally base64-encoded.
+
+ Returns:
+ The deserialized Python object.
+ """
+ if isinstance(data, str):
+ # Decode base64 string to bytes
+ data = base64.b64decode(data)
+
  return ForkingPickler.loads(data)

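A short round-trip showing the new output_str option; the payload is illustrative:

    from sglang.srt.utils import MultiprocessingSerializer

    payload = {"tensor_handles": [1, 2, 3]}
    raw = MultiprocessingSerializer.serialize(payload)                     # bytes, as before
    text = MultiprocessingSerializer.serialize(payload, output_str=True)   # base64 str, e.g. for JSON/HTTP payloads
    assert MultiprocessingSerializer.deserialize(raw) == payload
    assert MultiprocessingSerializer.deserialize(text) == payload          # str input is base64-decoded first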
@@ -1819,3 +1856,92 @@ class DeepEPMode(Enum):
  return DeepEPMode.low_latency
  else:
  return DeepEPMode.normal
+
+
+ def fast_topk(values, topk, dim):
+ if topk == 1:
+ # Use max along the specified dimension to get both value and index
+ return torch.max(values, dim=dim, keepdim=True)
+ else:
+ # Use topk for efficiency with larger k values
+ return torch.topk(values, topk, dim=dim)
+
+
+ def is_hopper_with_cuda_12_3():
+ if not is_cuda():
+ return False
+ is_hopper = torch.cuda.get_device_capability()[0] == 9
+ cuda_version = torch.version.cuda.split(".")
+ is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
+ return is_hopper and is_cuda_compatible
+
+
+ def get_free_port():
+ # try ipv4
+ try:
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.bind(("", 0))
+ return s.getsockname()[1]
+ except OSError:
+ # try ipv6
+ with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+ s.bind(("", 0))
+ return s.getsockname()[1]
+
+
+ def get_local_ip_by_remote() -> str:
+ # try ipv4
+ s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+ try:
+ s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable
+ return s.getsockname()[0]
+ except Exception:
+ pass
+
+ # try ipv6
+ try:
+ s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+ # Google's public DNS server, see
+ # https://developers.google.com/speed/public-dns/docs/using#addresses
+ s.connect(("2001:4860:4860::8888", 80)) # Doesn't need to be reachable
+ return s.getsockname()[0]
+ except Exception:
+ raise ValueError(f"Can not get local ip")
+
+
+ def is_page_size_one(server_args):
+ return server_args.page_size == 1
+
+
+ def is_no_spec_infer_or_topk_one(server_args):
+ return server_args.speculative_eagle_topk is None or (
+ server_args.speculative_eagle_topk is not None
+ and server_args.speculative_eagle_topk == 1
+ and is_page_size_one(server_args)
+ )
+
+
+ def is_fa3_default_architecture(hf_config):
+ architectures = getattr(hf_config, "architectures", None)
+ if not isinstance(architectures, list) or not architectures:
+ return False
+ default_archs = {
+ "Qwen2ForCausalLM",
+ "Llama4ForConditionalGeneration",
+ "LlamaForCausalLM",
+ "MistralForCausalLM",
+ }
+ return architectures[0] in default_archs
+
+
+ # Can be more general if it is used in multiple places (keep it simple and thus not general now)
+ class BumpAllocator:
+ def __init__(self, buffer_size: int, dtype, device):
+ self._buffer = torch.zeros((buffer_size,), dtype=dtype, device=device)
+ self._pointer = 0
+
+ def allocate(self, size: int):
+ assert self._pointer + size <= len(self._buffer)
+ output = self._buffer[self._pointer : self._pointer + size]
+ self._pointer += size
+ return output
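A brief sketch of the new fast_topk and BumpAllocator helpers in isolation; shapes and sizes are illustrative:

    import torch

    from sglang.srt.utils import BumpAllocator, fast_topk

    logits = torch.randn(4, 32000)
    # topk == 1 takes the cheaper torch.max path; keepdim=True keeps the (4, 1) shape,
    # so both branches return a (values, indices) pair with matching shapes.
    values, indices = fast_topk(logits, topk=1, dim=-1)

    # BumpAllocator hands out consecutive views over one preallocated zero buffer.
    alloc = BumpAllocator(buffer_size=1024, dtype=torch.int64, device="cpu")
    a = alloc.allocate(128)  # view over elements [0, 128)
    b = alloc.allocate(256)  # view over elements [128, 384)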