sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +73 -14
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/launch_server.py +2 -0
  5. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  6. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
  7. sglang/srt/checkpoint_engine/__init__.py +9 -0
  8. sglang/srt/checkpoint_engine/update.py +317 -0
  9. sglang/srt/compilation/backend.py +1 -1
  10. sglang/srt/configs/__init__.py +2 -0
  11. sglang/srt/configs/deepseek_ocr.py +542 -10
  12. sglang/srt/configs/deepseekvl2.py +95 -194
  13. sglang/srt/configs/kimi_linear.py +160 -0
  14. sglang/srt/configs/mamba_utils.py +66 -0
  15. sglang/srt/configs/model_config.py +30 -7
  16. sglang/srt/constants.py +7 -0
  17. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  18. sglang/srt/disaggregation/decode.py +34 -6
  19. sglang/srt/disaggregation/nixl/conn.py +2 -2
  20. sglang/srt/disaggregation/prefill.py +25 -3
  21. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  22. sglang/srt/distributed/parallel_state.py +9 -12
  23. sglang/srt/entrypoints/engine.py +31 -20
  24. sglang/srt/entrypoints/grpc_server.py +0 -1
  25. sglang/srt/entrypoints/http_server.py +94 -94
  26. sglang/srt/entrypoints/openai/protocol.py +7 -1
  27. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  28. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  29. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  30. sglang/srt/environ.py +23 -2
  31. sglang/srt/eplb/expert_distribution.py +64 -1
  32. sglang/srt/eplb/expert_location.py +106 -36
  33. sglang/srt/function_call/function_call_parser.py +2 -0
  34. sglang/srt/function_call/minimax_m2.py +367 -0
  35. sglang/srt/grpc/compile_proto.py +3 -0
  36. sglang/srt/layers/activation.py +6 -0
  37. sglang/srt/layers/attention/ascend_backend.py +233 -5
  38. sglang/srt/layers/attention/attention_registry.py +3 -0
  39. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  40. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  41. sglang/srt/layers/attention/fla/kda.py +1359 -0
  42. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  43. sglang/srt/layers/attention/flashattention_backend.py +19 -8
  44. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  45. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
  46. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  47. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  48. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  49. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  50. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  51. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  52. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  53. sglang/srt/layers/attention/nsa_backend.py +157 -23
  54. sglang/srt/layers/attention/triton_backend.py +4 -1
  55. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  56. sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
  57. sglang/srt/layers/attention/utils.py +78 -0
  58. sglang/srt/layers/communicator.py +24 -1
  59. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  60. sglang/srt/layers/layernorm.py +35 -6
  61. sglang/srt/layers/logits_processor.py +9 -20
  62. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  63. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  64. sglang/srt/layers/moe/ep_moe/layer.py +78 -289
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  67. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  68. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  69. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  70. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  71. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  72. sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
  73. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  75. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  76. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  77. sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
  78. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  79. sglang/srt/layers/moe/topk.py +35 -10
  80. sglang/srt/layers/moe/utils.py +3 -4
  81. sglang/srt/layers/pooler.py +21 -2
  82. sglang/srt/layers/quantization/__init__.py +13 -84
  83. sglang/srt/layers/quantization/auto_round.py +394 -0
  84. sglang/srt/layers/quantization/awq.py +0 -3
  85. sglang/srt/layers/quantization/base_config.py +7 -0
  86. sglang/srt/layers/quantization/fp8.py +68 -63
  87. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  88. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  89. sglang/srt/layers/quantization/gguf.py +566 -0
  90. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  91. sglang/srt/layers/quantization/mxfp4.py +30 -38
  92. sglang/srt/layers/quantization/unquant.py +23 -45
  93. sglang/srt/layers/quantization/w4afp8.py +38 -2
  94. sglang/srt/layers/radix_attention.py +5 -2
  95. sglang/srt/layers/rotary_embedding.py +130 -46
  96. sglang/srt/layers/sampler.py +12 -1
  97. sglang/srt/lora/lora_registry.py +9 -0
  98. sglang/srt/managers/async_mm_data_processor.py +122 -0
  99. sglang/srt/managers/data_parallel_controller.py +30 -3
  100. sglang/srt/managers/detokenizer_manager.py +3 -0
  101. sglang/srt/managers/io_struct.py +29 -4
  102. sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
  103. sglang/srt/managers/schedule_batch.py +74 -15
  104. sglang/srt/managers/scheduler.py +185 -144
  105. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  107. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  108. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  109. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  110. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  111. sglang/srt/managers/session_controller.py +6 -5
  112. sglang/srt/managers/tokenizer_manager.py +165 -78
  113. sglang/srt/managers/tp_worker.py +24 -1
  114. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  115. sglang/srt/mem_cache/common.py +1 -0
  116. sglang/srt/mem_cache/hicache_storage.py +7 -1
  117. sglang/srt/mem_cache/memory_pool.py +253 -57
  118. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  119. sglang/srt/mem_cache/radix_cache.py +4 -0
  120. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  121. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  122. sglang/srt/metrics/collector.py +46 -3
  123. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  124. sglang/srt/model_executor/forward_batch_info.py +55 -14
  125. sglang/srt/model_executor/model_runner.py +77 -170
  126. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  127. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  128. sglang/srt/model_loader/weight_utils.py +1 -1
  129. sglang/srt/models/bailing_moe.py +9 -2
  130. sglang/srt/models/deepseek_nextn.py +11 -2
  131. sglang/srt/models/deepseek_v2.py +296 -78
  132. sglang/srt/models/glm4.py +391 -77
  133. sglang/srt/models/glm4_moe.py +322 -354
  134. sglang/srt/models/glm4_moe_nextn.py +4 -14
  135. sglang/srt/models/glm4v.py +196 -55
  136. sglang/srt/models/glm4v_moe.py +29 -197
  137. sglang/srt/models/gpt_oss.py +1 -10
  138. sglang/srt/models/kimi_linear.py +678 -0
  139. sglang/srt/models/llama4.py +1 -1
  140. sglang/srt/models/llama_eagle3.py +11 -1
  141. sglang/srt/models/longcat_flash.py +2 -2
  142. sglang/srt/models/minimax_m2.py +922 -0
  143. sglang/srt/models/nvila.py +355 -0
  144. sglang/srt/models/nvila_lite.py +184 -0
  145. sglang/srt/models/qwen2.py +23 -2
  146. sglang/srt/models/qwen2_moe.py +30 -15
  147. sglang/srt/models/qwen3.py +35 -5
  148. sglang/srt/models/qwen3_moe.py +18 -12
  149. sglang/srt/models/qwen3_next.py +7 -0
  150. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  151. sglang/srt/multimodal/processors/base_processor.py +1 -0
  152. sglang/srt/multimodal/processors/glm4v.py +1 -1
  153. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  154. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  155. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  156. sglang/srt/multiplex/pdmux_context.py +164 -0
  157. sglang/srt/parser/conversation.py +7 -1
  158. sglang/srt/parser/reasoning_parser.py +28 -1
  159. sglang/srt/sampling/custom_logit_processor.py +67 -1
  160. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  161. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  162. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  163. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  164. sglang/srt/server_args.py +459 -199
  165. sglang/srt/single_batch_overlap.py +2 -4
  166. sglang/srt/speculative/draft_utils.py +16 -0
  167. sglang/srt/speculative/eagle_info.py +42 -36
  168. sglang/srt/speculative/eagle_info_v2.py +68 -25
  169. sglang/srt/speculative/eagle_utils.py +261 -16
  170. sglang/srt/speculative/eagle_worker.py +11 -3
  171. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  172. sglang/srt/speculative/spec_info.py +305 -31
  173. sglang/srt/speculative/spec_utils.py +44 -8
  174. sglang/srt/tracing/trace.py +121 -12
  175. sglang/srt/utils/common.py +142 -74
  176. sglang/srt/utils/hf_transformers_utils.py +38 -12
  177. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  178. sglang/test/kits/radix_cache_server_kit.py +50 -0
  179. sglang/test/runners.py +31 -7
  180. sglang/test/simple_eval_common.py +5 -3
  181. sglang/test/simple_eval_humaneval.py +1 -0
  182. sglang/test/simple_eval_math.py +1 -0
  183. sglang/test/simple_eval_mmlu.py +1 -0
  184. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  185. sglang/test/test_deterministic.py +235 -12
  186. sglang/test/test_deterministic_utils.py +2 -1
  187. sglang/test/test_utils.py +7 -1
  188. sglang/version.py +1 -1
  189. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
  190. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
  191. sglang/srt/models/vila.py +0 -306
  192. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  193. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  194. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  195. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/tracing/trace.py

@@ -15,6 +15,8 @@
 
 from __future__ import annotations
 
+import base64
+import json
 import logging
 import os
 import random
@@ -24,6 +26,8 @@ import uuid
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
+from sglang.srt.utils import get_int_env_var
+
 if TYPE_CHECKING:
     from sglang.srt.managers.scheduler import Req
 
@@ -85,6 +89,8 @@ class SglangTraceReqContext:
     # Indicates whether this instance is a replica from the main process.
     # When True, root_span is None and only root_span_context is preserved.
     is_copy: bool = False
+    bootstrap_room_span: Optional[trace.span.Span] = None
+    bootstrap_room_span_context: Optional[context.Context] = None
     root_span: Optional[trace.span.Span] = None
     root_span_context: Optional[context.Context] = None
 
@@ -96,8 +102,7 @@ class SglangTracePropagateContext:
 
     def to_dict(self):
         carrier: dict[str, str] = {}
-        context.attach(self.root_span_context)
-        propagate.inject(carrier)
+        propagate.inject(carrier, self.root_span_context)
 
         if self.prev_span_context:
             return {
@@ -149,6 +154,7 @@ class SglangTraceCustomIdGenerator(id_generator.IdGenerator):
 
 
 # global variables
+remote_trace_contexts: Dict[str, SglangTracePropagateContext] = {}
 threads_info: Dict[int, SglangTraceThreadInfo] = {}
 reqs_context: Dict[str, SglangTraceReqContext] = {}
 
@@ -193,8 +199,17 @@ def process_tracing_init(otlp_endpoint, server_name):
         resource=resource, id_generator=SglangTraceCustomIdGenerator()
     )
 
+    schedule_delay_millis = get_int_env_var(
+        "SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS", 500
+    )
+    max_export_batch_size = get_int_env_var(
+        "SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE", 64
+    )
+
     processor = BatchSpanProcessor(
-        OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
+        OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True),
+        schedule_delay_millis=schedule_delay_millis,
+        max_export_batch_size=max_export_batch_size,
     )
     tracer_provider.add_span_processor(processor)
     trace.set_tracer_provider(tracer_provider)
@@ -266,7 +281,9 @@ def __create_thread_context(pid, req_span_context, ts: Optional[int] = None):
     return thread_context
 
 
-def trace_get_proc_propagate_context(rid) -> Optional[Dict[str, Any]]:
+def trace_get_proc_propagate_context(
+    rid, remote_propagate=False
+) -> Optional[Dict[str, Any]]:
     if not tracing_enabled:
         return None
 
@@ -283,9 +300,11 @@ def trace_get_proc_propagate_context(rid) -> Optional[Dict[str, Any]]:
     elif thread_context.last_span_context:
         prev_span_context = thread_context.last_span_context
 
-    trace_context = SglangTracePropagateContext(
-        reqs_context[rid].root_span_context, prev_span_context
-    )
+    root_span_context = reqs_context[rid].root_span_context
+    if remote_propagate:
+        root_span_context = reqs_context[rid].bootstrap_room_span_context
+
+    trace_context = SglangTracePropagateContext(root_span_context, prev_span_context)
    return trace_context.to_dict()
 
 
@@ -327,10 +346,54 @@ def trace_set_proc_propagate_context(rid, trace_context: Optional[Dict[str, Any]
     ].last_span_context = trace_context.prev_span_context
 
 
+def trace_get_remote_propagate_context(bootstrap_room_list: List[str]):
+    if not tracing_enabled:
+        return ""
+
+    reqs_trace_contexts = {}
+    for bootstrap_room in bootstrap_room_list:
+        # In the router, rid is also the bootstrap room.
+        bootstrap_room = str(bootstrap_room)
+
+        if bootstrap_room not in reqs_context:
+            continue
+
+        _context = trace_get_proc_propagate_context(
+            bootstrap_room, remote_propagate=True
+        )
+        reqs_trace_contexts[bootstrap_room] = _context
+
+    json_str = json.dumps(reqs_trace_contexts, ensure_ascii=False)
+    return base64.b64encode(json_str.encode("utf-8")).decode("utf-8")
+
+
+def trace_set_remote_propagate_context(base64_str):
+    if not tracing_enabled:
+        return
+
+    if base64_str is None or base64_str == "" or base64_str == "None":
+        return
+
+    base64_bytes = base64.b64decode(base64_str)
+    json_str = base64_bytes.decode("utf-8")
+    remote_reqs_trace_contexts = json.loads(json_str)
+
+    for bootstrap_room in remote_reqs_trace_contexts:
+        if bootstrap_room in remote_trace_contexts:
+            continue
+
+        remote_trace_contexts[bootstrap_room] = (
+            SglangTracePropagateContext.instance_from_dict(
+                remote_reqs_trace_contexts[bootstrap_room]
+            )
+        )
+
+
 def trace_req_start(
     rid: str,
     bootstrap_room: Optional[int] = None,
     ts: Optional[int] = None,
+    role: Optional[str] = "null",
 ):
     if not tracing_enabled:
         return
@@ -344,6 +407,7 @@ def trace_req_start(
         return
 
     # create req context and root span
+    bootstrap_room = 0 if bootstrap_room is None else bootstrap_room
     reqs_context[rid] = SglangTraceReqContext(
         rid=rid,
         start_time_ns=ts,
@@ -352,23 +416,42 @@
         is_copy=False,
     )
 
+    # create bootstrap room span
+    tracer = threads_info[pid].tracer
+    if str(bootstrap_room) not in remote_trace_contexts:
+        attrs = {"bootstrap_room": str(hex(bootstrap_room))}
+        bootstrap_room_span = tracer.start_span(
+            name=f"Bootstrap Room {hex(bootstrap_room)}",
+            start_time=ts,
+            attributes=attrs,
+        )
+        reqs_context[rid].bootstrap_room_span = bootstrap_room_span
+        bootstrap_room_span_context = trace.set_span_in_context(bootstrap_room_span)
+    else:
+        bootstrap_room_span_context = remote_trace_contexts[
+            str(bootstrap_room)
+        ].root_span_context
+
     # Drop the worker_id added by MultiTokenizer
     orig_rid = rid.split("_")[-1]
-    tracer = threads_info[pid].tracer
+    role = "" if role == "null" else role
+    attrs = {"rid": orig_rid}
     root_span = tracer.start_span(
-        name=f"Req {orig_rid[:8]}",
+        name=f"{role} Req {orig_rid[:8]}",
         start_time=ts,
+        context=bootstrap_room_span_context,
+        attributes=attrs,
     )
 
     root_span.set_attributes(
         {
             "rid": rid,
-            "bootstrap_room": bootstrap_room if bootstrap_room else "None",
         }
     )
 
     reqs_context[rid].root_span = root_span
     reqs_context[rid].root_span_context = trace.set_span_in_context(root_span)
+    reqs_context[rid].bootstrap_room_span_context = bootstrap_room_span_context
 
     # create thread context and thread span
     reqs_context[rid].threads_context[pid] = __create_thread_context(
@@ -376,6 +459,10 @@
         reqs_context[rid].root_span_context,
         ts,
     )
+    if str(bootstrap_room) in remote_trace_contexts:
+        reqs_context[rid].threads_context[pid].last_span_context = (
+            remote_trace_contexts[str(bootstrap_room)].prev_span_context
+        )
 
 
 def trace_req_finish(
@@ -399,6 +486,10 @@ def trace_req_finish(
         req_context.root_span.set_attributes(attrs)
 
     req_context.root_span.end(end_time=ts)
+    if str(req_context.bootstrap_room) in remote_trace_contexts:
+        del remote_trace_contexts[str(req_context.bootstrap_room)]
+    else:
+        req_context.bootstrap_room_span.end(end_time=ts)
 
     del reqs_context[rid]
 
@@ -518,7 +609,9 @@ trace_slice = trace_slice_end
 
 
 # Add event to the current slice on the same thread with the same rid.
-def trace_event(name: str, rid: str, ts: Optional[int] = None):
+def trace_event(
+    name: str, rid: str, ts: Optional[int] = None, attrs: Dict[str, Any] = None
+):
     if not tracing_enabled:
         return
 
@@ -539,7 +632,7 @@ def trace_event(name: str, rid: str, ts: Optional[int] = None):
     ts = ts or __get_cur_time_ns()
 
     slice_info = thread_context.cur_slice_stack[-1]
-    slice_info.span.add_event(name=name, timestamp=ts)
+    slice_info.span.add_event(name=name, timestamp=ts, attributes=attrs)
 
 
 # Add attrs to the current slice on the same thread with the same rid.
@@ -569,6 +662,9 @@ def trace_slice_batch(
     name: str,
     reqs: List[Req],
 ):
+    if not tracing_enabled:
+        return
+
     for req in reqs:
         trace_slice(
             name,
@@ -576,3 +672,16 @@
             auto_next_anon=not req.finished(),
             thread_finish_flag=req.finished(),
         )
+
+
+def trace_event_batch(
+    name: str,
+    reqs: List[Req],
+    ts: Optional[int] = None,
+    attrs: Dict[str, Any] = None,
+):
+    if not tracing_enabled:
+        return
+
+    for req in reqs:
+        trace_event(name, req.rid, ts=ts, attrs=attrs)
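Note: the tracing changes above make the OTLP exporter's batching tunable through two new environment variables (defaults: 500 ms flush interval, 64 spans per batch). A minimal standalone sketch of the same wiring, assuming the stock opentelemetry-python SDK (the endpoint value below is illustrative):

import os
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# Same defaults as the diff: flush every 500 ms, at most 64 spans per export.
delay_ms = int(os.environ.get("SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS", "500"))
batch_size = int(os.environ.get("SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE", "64"))

provider = TracerProvider()
provider.add_span_processor(
    BatchSpanProcessor(
        OTLPSpanExporter(endpoint="localhost:4317", insecure=True),
        schedule_delay_millis=delay_ms,
        max_export_batch_size=batch_size,
    )
)
trace.set_tracer_provider(provider)

Lowering the delay trades export overhead for fresher spans; raising the batch size amortizes RPC cost under heavy request tracing.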
sglang/srt/utils/common.py

@@ -56,7 +56,6 @@ from json import JSONDecodeError
 from multiprocessing.reduction import ForkingPickler
 from pathlib import Path
 from typing import (
-    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -94,9 +93,6 @@ from typing_extensions import Literal
 from sglang.srt.environ import envs
 from sglang.srt.metrics.func_timer import enable_func_timer
 
-if TYPE_CHECKING:
-    from sglang.srt.layers.quantization.base_config import QuantizeMethodBase
-
 logger = logging.getLogger(__name__)
 
 show_time_cost = False
@@ -138,6 +134,7 @@ def is_xpu() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 
+@lru_cache(maxsize=1)
 def is_npu() -> bool:
     return hasattr(torch, "npu") and torch.npu.is_available()
 
@@ -191,7 +188,16 @@ is_hopper_with_cuda_12_3 = lambda: _check(9)
 def is_blackwell():
     if not is_cuda():
         return False
-    return torch.cuda.get_device_capability()[0] == 10
+    return torch.cuda.get_device_capability()[0] in [10, 12]
+
+
+@lru_cache(maxsize=1)
+def is_blackwell_supported(device=None) -> bool:
+    if not is_cuda_alike():
+        return False
+    return (torch.cuda.get_device_capability(device)[0] in [10, 12]) and (
+        torch.version.cuda >= "12.8"
+    )
 
 
 @lru_cache(maxsize=1)
@@ -1069,32 +1075,6 @@ def monkey_patch_p2p_access_check():
     setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)
 
 
-def monkey_patch_vllm_gguf_config():
-    try:
-        from vllm.model_executor.layers.quantization.gguf import (
-            GGUFConfig,
-            GGUFEmbeddingMethod,
-            GGUFLinearMethod,
-        )
-    except ImportError:
-        return
-
-    from sglang.srt.layers.linear import LinearBase
-    from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
-
-    def get_quant_method_with_embedding_replaced(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[QuantizeMethodBase]:
-        if isinstance(layer, LinearBase):
-            return GGUFLinearMethod(self)
-        elif isinstance(layer, VocabParallelEmbedding):
-            # patch to own VocabParallelEmbedding
-            return GGUFEmbeddingMethod(self)
-        return None
-
-    setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
-
-
 def set_ulimit(target_soft_limit=65535):
     # number of open files
     resource_type = resource.RLIMIT_NOFILE
@@ -1131,9 +1111,9 @@ def add_api_key_middleware(app, api_key: str):
     async def authentication(request, call_next):
         if request.method == "OPTIONS":
             return await call_next(request)
-        if request.url.path.startswith("/health"):
-            return await call_next(request)
-        if request.url.path.startswith("/metrics"):
+        if request.url.path.startswith("/health") or request.url.path.startswith(
+            "/metrics"
+        ):
             return await call_next(request)
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
@@ -1259,42 +1239,34 @@ def point_to_point_pyobj(
     dst: int = 1,
 ):
     """Send data from src to dst in group using DeviceToDevice communication."""
-
+    device = torch.get_device_module().current_device()
     if rank == src:
         if len(data) == 0:
-            tensor_size = torch.tensor(
-                [0], dtype=torch.long, device=torch.cuda.current_device()
-            )
+            tensor_size = torch.tensor([0], dtype=torch.long, device=device)
             dist.send(tensor_size, dst=dst, group=group)
         else:
             serialized_data = pickle.dumps(data)
             size = len(serialized_data)
             tensor_data = torch.ByteTensor(
                 np.frombuffer(serialized_data, dtype=np.uint8)
-            ).cuda(
-                device=torch.cuda.current_device()
+            ).to(
+                device=device
             )  # Move to GPU
-            tensor_size = torch.tensor(
-                [size], dtype=torch.long, device=torch.cuda.current_device()
-            )
+            tensor_size = torch.tensor([size], dtype=torch.long, device=device)
 
             dist.send(tensor_size, dst=dst, group=group)
             dist.send(tensor_data, dst=dst, group=group)
         return data
 
     elif rank == dst:
-        tensor_size = torch.tensor(
-            [0], dtype=torch.long, device=torch.cuda.current_device()
-        )
+        tensor_size = torch.tensor([0], dtype=torch.long, device=device)
         dist.recv(tensor_size, src=src, group=group)
         size = tensor_size.item()
 
         if size == 0:
             return []
 
-        tensor_data = torch.empty(
-            size, dtype=torch.uint8, device=torch.cuda.current_device()
-        )
+        tensor_data = torch.empty(size, dtype=torch.uint8, device=device)
         dist.recv(tensor_data, src=src, group=group)
 
         serialized_data = bytes(
@@ -2106,7 +2078,7 @@ class MultiprocessingSerializer:
 
         if output_str:
             # Convert bytes to base64-encoded string
-            pybase64.b64encode(output).decode("utf-8")
+            output = pybase64.b64encode(output).decode("utf-8")
 
         return output
 
@@ -2125,7 +2097,78 @@
        # Decode base64 string to bytes
        data = pybase64.b64decode(data, validate=True)
 
-        return ForkingPickler.loads(data)
+        return SafeUnpickler(io.BytesIO(data)).load()
+
+
+class SafeUnpickler(pickle.Unpickler):
+    ALLOWED_MODULE_PREFIXES = {
+        # --- Python types ---
+        "builtins.",
+        "collections.",
+        "copyreg.",
+        "functools.",
+        "itertools.",
+        "operator.",
+        "types.",
+        "weakref.",
+        # --- PyTorch types ---
+        "torch.",
+        "torch._tensor.",
+        "torch.storage.",
+        "torch.nn.parameter.",
+        "torch.autograd.function.",
+        # --- torch distributed ---
+        "torch.distributed.",
+        "torch.distributed._shard.",
+        "torch.distributed._composable.",
+        "torch._C._distributed_c10d.",
+        "torch._C._distributed_fsdp.",
+        "torch.distributed.optim.",
+        # --- multiprocessing ---
+        "multiprocessing.resource_sharer.",
+        "multiprocessing.reduction.",
+        "pickletools.",
+        # --- PEFT / LoRA ---
+        "peft.",
+        "transformers.",
+        "huggingface_hub.",
+        # --- SGLang & Unitest ---
+        "sglang.srt.weight_sync.tensor_bucket.",
+        "sglang.srt.model_executor.model_runner.",
+        "sglang.srt.layers.",
+        "sglang.srt.utils.",
+    }
+
+    DENY_CLASSES = {
+        ("builtins", "eval"),
+        ("builtins", "exec"),
+        ("builtins", "compile"),
+        ("os", "system"),
+        ("subprocess", "Popen"),
+        ("subprocess", "run"),
+        ("codecs", "decode"),
+        ("types", "CodeType"),
+        ("types", "FunctionType"),
+    }
+
+    def find_class(self, module, name):
+        # Block deterministic attacks
+        if (module, name) in self.DENY_CLASSES:
+            raise RuntimeError(
+                f"Blocked unsafe class loading ({module}.{name}), "
+                f"to prevent exploitation of CVE-2025-10164"
+            )
+        # Allowlist of safe-to-load modules.
+        if any(
+            (module + ".").startswith(prefix) for prefix in self.ALLOWED_MODULE_PREFIXES
+        ):
+            return super().find_class(module, name)
+
+        # Block everything else. (Potential attack surface)
+        raise RuntimeError(
+            f"Blocked unsafe class loading ({module}.{name}), "
+            f"to prevent exploitation of CVE-2025-10164"
+        )
 
 
 def debug_timing(func):
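Note: SafeUnpickler above replaces ForkingPickler.loads in MultiprocessingSerializer.deserialize to close CVE-2025-10164. find_class is the single hook through which pickle resolves global references, so a deny/allow list there blocks gadget payloads before any object is constructed. A toy demonstration of the pattern (the Evil class is hypothetical, not part of sglang):

import io
import pickle

class Evil:
    def __reduce__(self):
        import os
        return (os.system, ("echo pwned",))  # classic pickle RCE gadget

class DemoUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if (module, name) == ("os", "system"):
            raise RuntimeError(f"Blocked unsafe class loading ({module}.{name})")
        return super().find_class(module, name)

payload = pickle.dumps(Evil())
# pickle.loads(payload) would run the shell command; the restricted
# unpickler raises before os.system is ever resolved.
try:
    DemoUnpickler(io.BytesIO(payload)).load()
except RuntimeError as e:
    print(e)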
@@ -2308,16 +2351,24 @@ def launch_dummy_health_check_server(host, port, enable_metrics):
     )
     server = uvicorn.Server(config=config)
 
-    try:
-        loop = asyncio.get_running_loop()
-        logger.info(
-            f"Dummy health check server scheduled on existing loop at {host}:{port}"
-        )
-        loop.create_task(server.serve())
+    # Run server in a background daemon thread with its own event loop
+    # This prevents blocking the main thread while still serving health checks
+    def run_server():
+        try:
+            asyncio.run(server.serve())
+        except Exception as e:
+            logger.error(f"Dummy health check server failed to start: {e}")
+            raise
+        finally:
+            logger.info(f"Dummy health check server stopped at {host}:{port}")
 
-    except RuntimeError:
-        logger.info(f"Starting dummy health check server at {host}:{port}")
-        server.run()
+    thread = threading.Thread(
+        target=run_server, daemon=True, name="health-check-server"
+    )
+    thread.start()
+    logger.info(
+        f"Dummy health check server started in background thread at {host}:{port}"
+    )
 
 
 def create_checksum(directory: str):
@@ -2578,17 +2629,12 @@ def get_local_ip_auto(fallback: str = None) -> str:
     raise ValueError("Can not get local ip")
 
 
-def is_page_size_one(server_args):
-    return server_args.page_size == 1
-
-
 # TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
 # TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
 def is_no_spec_infer_or_topk_one(server_args):
     return server_args.speculative_eagle_topk is None or (
-        server_args.speculative_eagle_topk is not None
-        and server_args.speculative_eagle_topk == 1
-        and is_page_size_one(server_args)
+        server_args.speculative_eagle_topk == 1
+        and (server_args.page_size == 1 or server_args.page_size is None)
     )
 
 
@@ -3068,12 +3114,16 @@ def apply_module_patch(target_module, target_function, wrappers):
     setattr(original_module, target_function, candidate)
 
     for key, value in sys.modules.copy().items():
-        if (
-            target_function is not None
-            and hasattr(value, target_function)
-            and id(getattr(value, target_function)) == original_function_id
-        ):
-            setattr(value, target_function, candidate)
+        try:
+            if (
+                target_function is not None
+                and hasattr(value, target_function)
+                and id(getattr(value, target_function)) == original_function_id
+            ):
+                setattr(value, target_function, candidate)
+        except ImportError as e:
+            # Ignore some modules reporting ImportError when calling hasattr
+            logger.warning(f"Ignore {value} reports ImportError with:\n{str(e)}")
 
 
 def parse_module_path(module_path, function_name, create_dummy):
@@ -3525,6 +3575,24 @@ def cached_triton_kernel(key_fn=None):
     """
 
     def decorator(fn):
-        return CachedKernel(fn, key_fn)
+        if envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.get():
+            logger.debug(
+                f"{envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.name} = True. Using custom triton kernel cache."
+            )
+            return CachedKernel(fn, key_fn)
+        else:
+            # Fallback to the native triton cache.
+            logger.debug(
+                f"{envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.name} = False. Using native triton kernel cache."
+            )
+            return fn
 
     return decorator
+
+
+# Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
+def calc_diff(x, y):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
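Note: calc_diff, copied from DeepGEMM, reduces to sum((x - y)^2) / (sum(x^2) + sum(y^2)), since sum((x - y)^2) = sum(x^2) - 2*sum(x*y) + sum(y^2); it is 0 for identical tensors and grows with relative error. A quick sanity check, assuming PyTorch:

import torch

def calc_diff(x, y):  # as added above in sglang/srt/utils/common.py
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    sim = 2 * (x * y).sum() / denominator
    return 1 - sim

x = torch.ones(4)
print(float(calc_diff(x, x)))         # 0.0 for identical tensors
print(float(calc_diff(x, 1.01 * x)))  # ~4.95e-05, i.e. 0.0004 / 8.0804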
sglang/srt/utils/hf_transformers_utils.py

@@ -43,6 +43,7 @@ from sglang.srt.configs import (
     DotsVLMConfig,
     ExaoneConfig,
     FalconH1Config,
+    KimiLinearConfig,
     KimiVLConfig,
     LongcatFlashConfig,
     MultiModalityConfig,
@@ -54,6 +55,7 @@ from sglang.srt.configs import (
 from sglang.srt.configs.deepseek_ocr import DeepseekVLV2Config
 from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
+from sglang.srt.multimodal.customized_mm_processor_utils import _CUSTOMIZED_MM_PROCESSOR
 from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
 
 _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
@@ -67,6 +69,7 @@ _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
     Step3VLConfig,
     LongcatFlashConfig,
     Olmo3Config,
+    KimiLinearConfig,
     Qwen3NextConfig,
     FalconH1Config,
     DotsVLMConfig,
@@ -172,6 +175,16 @@ def _load_deepseek_v32_model(
     )
 
 
+def _is_deepseek_ocr_model(config: PretrainedConfig) -> bool:
+    # TODO: Remove this workaround related when AutoConfig correctly identifies deepseek-ocr.
+    # Hugging Face's AutoConfig currently misidentifies it as deepseekvl2.
+    return (
+        getattr(config, "auto_map", None) is not None
+        and config.auto_map.get("AutoModel")
+        == "modeling_deepseekocr.DeepseekOCRForCausalLM"
+    )
+
+
 @lru_cache_frozenset(maxsize=32)
 def get_config(
     model: str,
@@ -197,10 +210,6 @@ def get_config(
         config = AutoConfig.from_pretrained(
             model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
         )
-        if "deepseek-ai/DeepSeek-OCR" in model:
-            config.model_type = "deepseek-ocr"
-            # Due to an unknown reason, Hugging Face’s AutoConfig mistakenly recognizes the configuration of deepseek-ocr as deepseekvl2.
-            # This is a temporary workaround and will require further optimization.
 
     except ValueError as e:
         if not "deepseek_v32" in str(e):
@@ -237,7 +246,11 @@ def get_config(
             setattr(config, key, val)
 
     if config.model_type in _CONFIG_REGISTRY:
-        config_class = _CONFIG_REGISTRY[config.model_type]
+        model_type = config.model_type
+        if model_type == "deepseek_vl_v2":
+            if _is_deepseek_ocr_model(config):
+                model_type = "deepseek-ocr"
+        config_class = _CONFIG_REGISTRY[model_type]
         config = config_class.from_pretrained(model, revision=revision)
         # NOTE(HandH1998): Qwen2VL requires `_name_or_path` attribute in `config`.
         setattr(config, "_name_or_path", model)
@@ -441,6 +454,10 @@ def get_processor(
         **kwargs,
     )
 
+    if _is_deepseek_ocr_model(config):
+        # Temporary hack for load deepseek-ocr
+        config.model_type = "deepseek-ocr"
+
     # fix: for Qwen2-VL and Sarashina2Vision models, inject default 'size' if not provided.
     if config.model_type in {"qwen2_vl", "sarashina2_vision"}:
         if "size" not in kwargs:
@@ -458,13 +475,22 @@ def get_processor(
                 **kwargs,
             )
         else:
-            processor = AutoProcessor.from_pretrained(
-                tokenizer_name,
-                *args,
-                trust_remote_code=trust_remote_code,
-                revision=revision,
-                **kwargs,
-            )
+            if config.model_type in _CUSTOMIZED_MM_PROCESSOR:
+                processor = _CUSTOMIZED_MM_PROCESSOR[config.model_type].from_pretrained(
+                    tokenizer_name,
+                    *args,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    **kwargs,
+                )
+            else:
+                processor = AutoProcessor.from_pretrained(
+                    tokenizer_name,
+                    *args,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    **kwargs,
+                )
 
     except ValueError as e:
         error_message = str(e)
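Note: the get_processor change dispatches through _CUSTOMIZED_MM_PROCESSOR (from sglang/srt/multimodal/customized_mm_processor_utils.py, file 150 in the list above) before falling back to AutoProcessor. The underlying shape is a plain model_type-to-class registry; a sketch of that pattern under that assumption (the decorator name and demo class below are illustrative, not sglang's actual registration API):

from typing import Dict, Type

_CUSTOMIZED_MM_PROCESSOR: Dict[str, Type] = {}

def register_customized_processor(model_type: str):
    # Map a Hugging Face model_type to a custom processor class.
    def wrap(processor_cls: Type) -> Type:
        _CUSTOMIZED_MM_PROCESSOR[model_type] = processor_cls
        return processor_cls
    return wrap

@register_customized_processor("demo_vlm")
class DemoProcessor:
    @classmethod
    def from_pretrained(cls, name, *args, **kwargs):
        return cls()

# Dispatch mirrors the diff: prefer the registry, else fall back to AutoProcessor.
model_type = "demo_vlm"
if model_type in _CUSTOMIZED_MM_PROCESSOR:
    processor = _CUSTOMIZED_MM_PROCESSOR[model_type].from_pretrained("dummy")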