sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +149 -34
 - sglang/bench_serving.py +73 -14
 - sglang/compile_deep_gemm.py +13 -7
 - sglang/launch_server.py +2 -0
 - sglang/srt/batch_invariant_ops/__init__.py +2 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
 - sglang/srt/checkpoint_engine/__init__.py +9 -0
 - sglang/srt/checkpoint_engine/update.py +317 -0
 - sglang/srt/compilation/backend.py +1 -1
 - sglang/srt/configs/__init__.py +2 -0
 - sglang/srt/configs/deepseek_ocr.py +542 -10
 - sglang/srt/configs/deepseekvl2.py +95 -194
 - sglang/srt/configs/kimi_linear.py +160 -0
 - sglang/srt/configs/mamba_utils.py +66 -0
 - sglang/srt/configs/model_config.py +30 -7
 - sglang/srt/constants.py +7 -0
 - sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
 - sglang/srt/disaggregation/decode.py +34 -6
 - sglang/srt/disaggregation/nixl/conn.py +2 -2
 - sglang/srt/disaggregation/prefill.py +25 -3
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
 - sglang/srt/distributed/parallel_state.py +9 -12
 - sglang/srt/entrypoints/engine.py +31 -20
 - sglang/srt/entrypoints/grpc_server.py +0 -1
 - sglang/srt/entrypoints/http_server.py +94 -94
 - sglang/srt/entrypoints/openai/protocol.py +7 -1
 - sglang/srt/entrypoints/openai/serving_chat.py +42 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +10 -0
 - sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
 - sglang/srt/environ.py +23 -2
 - sglang/srt/eplb/expert_distribution.py +64 -1
 - sglang/srt/eplb/expert_location.py +106 -36
 - sglang/srt/function_call/function_call_parser.py +2 -0
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/grpc/compile_proto.py +3 -0
 - sglang/srt/layers/activation.py +6 -0
 - sglang/srt/layers/attention/ascend_backend.py +233 -5
 - sglang/srt/layers/attention/attention_registry.py +3 -0
 - sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
 - sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
 - sglang/srt/layers/attention/fla/kda.py +1359 -0
 - sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
 - sglang/srt/layers/attention/flashattention_backend.py +19 -8
 - sglang/srt/layers/attention/flashinfer_backend.py +10 -1
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
 - sglang/srt/layers/attention/flashmla_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
 - sglang/srt/layers/attention/mamba/mamba.py +20 -11
 - sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
 - sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
 - sglang/srt/layers/attention/nsa/transform_index.py +1 -1
 - sglang/srt/layers/attention/nsa_backend.py +157 -23
 - sglang/srt/layers/attention/triton_backend.py +4 -1
 - sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
 - sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
 - sglang/srt/layers/attention/utils.py +78 -0
 - sglang/srt/layers/communicator.py +24 -1
 - sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
 - sglang/srt/layers/layernorm.py +35 -6
 - sglang/srt/layers/logits_processor.py +9 -20
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
 - sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
 - sglang/srt/layers/moe/ep_moe/layer.py +78 -289
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
 - sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
 - sglang/srt/layers/moe/topk.py +35 -10
 - sglang/srt/layers/moe/utils.py +3 -4
 - sglang/srt/layers/pooler.py +21 -2
 - sglang/srt/layers/quantization/__init__.py +13 -84
 - sglang/srt/layers/quantization/auto_round.py +394 -0
 - sglang/srt/layers/quantization/awq.py +0 -3
 - sglang/srt/layers/quantization/base_config.py +7 -0
 - sglang/srt/layers/quantization/fp8.py +68 -63
 - sglang/srt/layers/quantization/fp8_kernel.py +1 -1
 - sglang/srt/layers/quantization/fp8_utils.py +2 -2
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +168 -11
 - sglang/srt/layers/quantization/mxfp4.py +30 -38
 - sglang/srt/layers/quantization/unquant.py +23 -45
 - sglang/srt/layers/quantization/w4afp8.py +38 -2
 - sglang/srt/layers/radix_attention.py +5 -2
 - sglang/srt/layers/rotary_embedding.py +130 -46
 - sglang/srt/layers/sampler.py +12 -1
 - sglang/srt/lora/lora_registry.py +9 -0
 - sglang/srt/managers/async_mm_data_processor.py +122 -0
 - sglang/srt/managers/data_parallel_controller.py +30 -3
 - sglang/srt/managers/detokenizer_manager.py +3 -0
 - sglang/srt/managers/io_struct.py +29 -4
 - sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
 - sglang/srt/managers/schedule_batch.py +74 -15
 - sglang/srt/managers/scheduler.py +185 -144
 - sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
 - sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
 - sglang/srt/managers/scheduler_pp_mixin.py +7 -2
 - sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
 - sglang/srt/managers/session_controller.py +6 -5
 - sglang/srt/managers/tokenizer_manager.py +165 -78
 - sglang/srt/managers/tp_worker.py +24 -1
 - sglang/srt/mem_cache/base_prefix_cache.py +23 -4
 - sglang/srt/mem_cache/common.py +1 -0
 - sglang/srt/mem_cache/hicache_storage.py +7 -1
 - sglang/srt/mem_cache/memory_pool.py +253 -57
 - sglang/srt/mem_cache/memory_pool_host.py +12 -5
 - sglang/srt/mem_cache/radix_cache.py +4 -0
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
 - sglang/srt/metrics/collector.py +46 -3
 - sglang/srt/model_executor/cuda_graph_runner.py +15 -3
 - sglang/srt/model_executor/forward_batch_info.py +55 -14
 - sglang/srt/model_executor/model_runner.py +77 -170
 - sglang/srt/model_executor/npu_graph_runner.py +7 -3
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
 - sglang/srt/model_loader/weight_utils.py +1 -1
 - sglang/srt/models/bailing_moe.py +9 -2
 - sglang/srt/models/deepseek_nextn.py +11 -2
 - sglang/srt/models/deepseek_v2.py +296 -78
 - sglang/srt/models/glm4.py +391 -77
 - sglang/srt/models/glm4_moe.py +322 -354
 - sglang/srt/models/glm4_moe_nextn.py +4 -14
 - sglang/srt/models/glm4v.py +196 -55
 - sglang/srt/models/glm4v_moe.py +29 -197
 - sglang/srt/models/gpt_oss.py +1 -10
 - sglang/srt/models/kimi_linear.py +678 -0
 - sglang/srt/models/llama4.py +1 -1
 - sglang/srt/models/llama_eagle3.py +11 -1
 - sglang/srt/models/longcat_flash.py +2 -2
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/qwen2.py +23 -2
 - sglang/srt/models/qwen2_moe.py +30 -15
 - sglang/srt/models/qwen3.py +35 -5
 - sglang/srt/models/qwen3_moe.py +18 -12
 - sglang/srt/models/qwen3_next.py +7 -0
 - sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
 - sglang/srt/multimodal/processors/base_processor.py +1 -0
 - sglang/srt/multimodal/processors/glm4v.py +1 -1
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
 - sglang/srt/multiplex/multiplexing_mixin.py +209 -0
 - sglang/srt/multiplex/pdmux_context.py +164 -0
 - sglang/srt/parser/conversation.py +7 -1
 - sglang/srt/parser/reasoning_parser.py +28 -1
 - sglang/srt/sampling/custom_logit_processor.py +67 -1
 - sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
 - sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
 - sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
 - sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
 - sglang/srt/server_args.py +459 -199
 - sglang/srt/single_batch_overlap.py +2 -4
 - sglang/srt/speculative/draft_utils.py +16 -0
 - sglang/srt/speculative/eagle_info.py +42 -36
 - sglang/srt/speculative/eagle_info_v2.py +68 -25
 - sglang/srt/speculative/eagle_utils.py +261 -16
 - sglang/srt/speculative/eagle_worker.py +11 -3
 - sglang/srt/speculative/eagle_worker_v2.py +15 -9
 - sglang/srt/speculative/spec_info.py +305 -31
 - sglang/srt/speculative/spec_utils.py +44 -8
 - sglang/srt/tracing/trace.py +121 -12
 - sglang/srt/utils/common.py +142 -74
 - sglang/srt/utils/hf_transformers_utils.py +38 -12
 - sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
 - sglang/test/kits/radix_cache_server_kit.py +50 -0
 - sglang/test/runners.py +31 -7
 - sglang/test/simple_eval_common.py +5 -3
 - sglang/test/simple_eval_humaneval.py +1 -0
 - sglang/test/simple_eval_math.py +1 -0
 - sglang/test/simple_eval_mmlu.py +1 -0
 - sglang/test/simple_eval_mmmu_vlm.py +1 -0
 - sglang/test/test_deterministic.py +235 -12
 - sglang/test/test_deterministic_utils.py +2 -1
 - sglang/test/test_utils.py +7 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
 - sglang/srt/models/vila.py +0 -306
 - /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
 
    
        sglang/srt/tracing/trace.py
    CHANGED
    
    | 
         @@ -15,6 +15,8 @@ 
     | 
|
| 
       15 
15 
     | 
    
         | 
| 
       16 
16 
     | 
    
         
             
            from __future__ import annotations
         
     | 
| 
       17 
17 
     | 
    
         | 
| 
      
 18 
     | 
    
         
            +
            import base64
         
     | 
| 
      
 19 
     | 
    
         
            +
            import json
         
     | 
| 
       18 
20 
     | 
    
         
             
            import logging
         
     | 
| 
       19 
21 
     | 
    
         
             
            import os
         
     | 
| 
       20 
22 
     | 
    
         
             
            import random
         
     | 
| 
         @@ -24,6 +26,8 @@ import uuid 
     | 
|
| 
       24 
26 
     | 
    
         
             
            from dataclasses import dataclass
         
     | 
| 
       25 
27 
     | 
    
         
             
            from typing import TYPE_CHECKING, Any, Dict, List, Optional
         
     | 
| 
       26 
28 
     | 
    
         | 
| 
      
 29 
     | 
    
         
            +
            from sglang.srt.utils import get_int_env_var
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
       27 
31 
     | 
    
         
             
            if TYPE_CHECKING:
         
     | 
| 
       28 
32 
     | 
    
         
             
                from sglang.srt.managers.scheduler import Req
         
     | 
| 
       29 
33 
     | 
    
         | 
| 
         @@ -85,6 +89,8 @@ class SglangTraceReqContext: 
     | 
|
| 
       85 
89 
     | 
    
         
             
                # Indicates whether this instance is a replica from the main process.
         
     | 
| 
       86 
90 
     | 
    
         
             
                # When True, root_span is None and only root_span_context is preserved.
         
     | 
| 
       87 
91 
     | 
    
         
             
                is_copy: bool = False
         
     | 
| 
      
 92 
     | 
    
         
            +
                bootstrap_room_span: Optional[trace.span.Span] = None
         
     | 
| 
      
 93 
     | 
    
         
            +
                bootstrap_room_span_context: Optional[context.Context] = None
         
     | 
| 
       88 
94 
     | 
    
         
             
                root_span: Optional[trace.span.Span] = None
         
     | 
| 
       89 
95 
     | 
    
         
             
                root_span_context: Optional[context.Context] = None
         
     | 
| 
       90 
96 
     | 
    
         | 
| 
         @@ -96,8 +102,7 @@ class SglangTracePropagateContext: 
     | 
|
| 
       96 
102 
     | 
    
         | 
| 
       97 
103 
     | 
    
         
             
                def to_dict(self):
         
     | 
| 
       98 
104 
     | 
    
         
             
                    carrier: dict[str, str] = {}
         
     | 
| 
       99 
     | 
    
         
            -
                     
     | 
| 
       100 
     | 
    
         
            -
                    propagate.inject(carrier)
         
     | 
| 
      
 105 
     | 
    
         
            +
                    propagate.inject(carrier, self.root_span_context)
         
     | 
| 
       101 
106 
     | 
    
         | 
| 
       102 
107 
     | 
    
         
             
                    if self.prev_span_context:
         
     | 
| 
       103 
108 
     | 
    
         
             
                        return {
         
     | 
| 
         @@ -149,6 +154,7 @@ class SglangTraceCustomIdGenerator(id_generator.IdGenerator): 
     | 
|
| 
       149 
154 
     | 
    
         | 
| 
       150 
155 
     | 
    
         | 
| 
       151 
156 
     | 
    
         
             
            # global variables
         
     | 
| 
      
 157 
     | 
    
         
            +
            remote_trace_contexts: Dict[str, SglangTracePropagateContext] = {}
         
     | 
| 
       152 
158 
     | 
    
         
             
            threads_info: Dict[int, SglangTraceThreadInfo] = {}
         
     | 
| 
       153 
159 
     | 
    
         
             
            reqs_context: Dict[str, SglangTraceReqContext] = {}
         
     | 
| 
       154 
160 
     | 
    
         | 
| 
         @@ -193,8 +199,17 @@ def process_tracing_init(otlp_endpoint, server_name): 
     | 
|
| 
       193 
199 
     | 
    
         
             
                        resource=resource, id_generator=SglangTraceCustomIdGenerator()
         
     | 
| 
       194 
200 
     | 
    
         
             
                    )
         
     | 
| 
       195 
201 
     | 
    
         | 
| 
      
 202 
     | 
    
         
            +
                    schedule_delay_millis = get_int_env_var(
         
     | 
| 
      
 203 
     | 
    
         
            +
                        "SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS", 500
         
     | 
| 
      
 204 
     | 
    
         
            +
                    )
         
     | 
| 
      
 205 
     | 
    
         
            +
                    max_export_batch_size = get_int_env_var(
         
     | 
| 
      
 206 
     | 
    
         
            +
                        "SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE", 64
         
     | 
| 
      
 207 
     | 
    
         
            +
                    )
         
     | 
| 
      
 208 
     | 
    
         
            +
             
     | 
| 
       196 
209 
     | 
    
         
             
                    processor = BatchSpanProcessor(
         
     | 
| 
       197 
     | 
    
         
            -
                        OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
         
     | 
| 
      
 210 
     | 
    
         
            +
                        OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True),
         
     | 
| 
      
 211 
     | 
    
         
            +
                        schedule_delay_millis=schedule_delay_millis,
         
     | 
| 
      
 212 
     | 
    
         
            +
                        max_export_batch_size=max_export_batch_size,
         
     | 
| 
       198 
213 
     | 
    
         
             
                    )
         
     | 
| 
       199 
214 
     | 
    
         
             
                    tracer_provider.add_span_processor(processor)
         
     | 
| 
       200 
215 
     | 
    
         
             
                    trace.set_tracer_provider(tracer_provider)
         
     | 
| 
         @@ -266,7 +281,9 @@ def __create_thread_context(pid, req_span_context, ts: Optional[int] = None): 
     | 
|
| 
       266 
281 
     | 
    
         
             
                return thread_context
         
     | 
| 
       267 
282 
     | 
    
         | 
| 
       268 
283 
     | 
    
         | 
| 
       269 
     | 
    
         
            -
            def trace_get_proc_propagate_context( 
     | 
| 
      
 284 
     | 
    
         
            +
            def trace_get_proc_propagate_context(
         
     | 
| 
      
 285 
     | 
    
         
            +
                rid, remote_propagate=False
         
     | 
| 
      
 286 
     | 
    
         
            +
            ) -> Optional[Dict[str, Any]]:
         
     | 
| 
       270 
287 
     | 
    
         
             
                if not tracing_enabled:
         
     | 
| 
       271 
288 
     | 
    
         
             
                    return None
         
     | 
| 
       272 
289 
     | 
    
         | 
| 
         @@ -283,9 +300,11 @@ def trace_get_proc_propagate_context(rid) -> Optional[Dict[str, Any]]: 
     | 
|
| 
       283 
300 
     | 
    
         
             
                elif thread_context.last_span_context:
         
     | 
| 
       284 
301 
     | 
    
         
             
                    prev_span_context = thread_context.last_span_context
         
     | 
| 
       285 
302 
     | 
    
         | 
| 
       286 
     | 
    
         
            -
                 
     | 
| 
       287 
     | 
    
         
            -
             
     | 
| 
       288 
     | 
    
         
            -
             
     | 
| 
      
 303 
     | 
    
         
            +
                root_span_context = reqs_context[rid].root_span_context
         
     | 
| 
      
 304 
     | 
    
         
            +
                if remote_propagate:
         
     | 
| 
      
 305 
     | 
    
         
            +
                    root_span_context = reqs_context[rid].bootstrap_room_span_context
         
     | 
| 
      
 306 
     | 
    
         
            +
             
     | 
| 
      
 307 
     | 
    
         
            +
                trace_context = SglangTracePropagateContext(root_span_context, prev_span_context)
         
     | 
| 
       289 
308 
     | 
    
         
             
                return trace_context.to_dict()
         
     | 
| 
       290 
309 
     | 
    
         | 
| 
       291 
310 
     | 
    
         | 
| 
         @@ -327,10 +346,54 @@ def trace_set_proc_propagate_context(rid, trace_context: Optional[Dict[str, Any] 
     | 
|
| 
       327 
346 
     | 
    
         
             
                ].last_span_context = trace_context.prev_span_context
         
     | 
| 
       328 
347 
     | 
    
         | 
| 
       329 
348 
     | 
    
         | 
| 
      
 349 
     | 
    
         
            +
            def trace_get_remote_propagate_context(bootstrap_room_list: List[str]):
         
     | 
| 
      
 350 
     | 
    
         
            +
                if not tracing_enabled:
         
     | 
| 
      
 351 
     | 
    
         
            +
                    return ""
         
     | 
| 
      
 352 
     | 
    
         
            +
             
     | 
| 
      
 353 
     | 
    
         
            +
                reqs_trace_contexts = {}
         
     | 
| 
      
 354 
     | 
    
         
            +
                for bootstrap_room in bootstrap_room_list:
         
     | 
| 
      
 355 
     | 
    
         
            +
                    # In the router, rid is also the bootstrap room.
         
     | 
| 
      
 356 
     | 
    
         
            +
                    bootstrap_room = str(bootstrap_room)
         
     | 
| 
      
 357 
     | 
    
         
            +
             
     | 
| 
      
 358 
     | 
    
         
            +
                    if bootstrap_room not in reqs_context:
         
     | 
| 
      
 359 
     | 
    
         
            +
                        continue
         
     | 
| 
      
 360 
     | 
    
         
            +
             
     | 
| 
      
 361 
     | 
    
         
            +
                    _context = trace_get_proc_propagate_context(
         
     | 
| 
      
 362 
     | 
    
         
            +
                        bootstrap_room, remote_propagate=True
         
     | 
| 
      
 363 
     | 
    
         
            +
                    )
         
     | 
| 
      
 364 
     | 
    
         
            +
                    reqs_trace_contexts[bootstrap_room] = _context
         
     | 
| 
      
 365 
     | 
    
         
            +
             
     | 
| 
      
 366 
     | 
    
         
            +
                json_str = json.dumps(reqs_trace_contexts, ensure_ascii=False)
         
     | 
| 
      
 367 
     | 
    
         
            +
                return base64.b64encode(json_str.encode("utf-8")).decode("utf-8")
         
     | 
| 
      
 368 
     | 
    
         
            +
             
     | 
| 
      
 369 
     | 
    
         
            +
             
     | 
| 
      
 370 
     | 
    
         
            +
            def trace_set_remote_propagate_context(base64_str):
         
     | 
| 
      
 371 
     | 
    
         
            +
                if not tracing_enabled:
         
     | 
| 
      
 372 
     | 
    
         
            +
                    return
         
     | 
| 
      
 373 
     | 
    
         
            +
             
     | 
| 
      
 374 
     | 
    
         
            +
                if base64_str is None or base64_str == "" or base64_str == "None":
         
     | 
| 
      
 375 
     | 
    
         
            +
                    return
         
     | 
| 
      
 376 
     | 
    
         
            +
             
     | 
| 
      
 377 
     | 
    
         
            +
                base64_bytes = base64.b64decode(base64_str)
         
     | 
| 
      
 378 
     | 
    
         
            +
                json_str = base64_bytes.decode("utf-8")
         
     | 
| 
      
 379 
     | 
    
         
            +
                remote_reqs_trace_contexts = json.loads(json_str)
         
     | 
| 
      
 380 
     | 
    
         
            +
             
     | 
| 
      
 381 
     | 
    
         
            +
                for bootstrap_room in remote_reqs_trace_contexts:
         
     | 
| 
      
 382 
     | 
    
         
            +
                    if bootstrap_room in remote_trace_contexts:
         
     | 
| 
      
 383 
     | 
    
         
            +
                        continue
         
     | 
| 
      
 384 
     | 
    
         
            +
             
     | 
| 
      
 385 
     | 
    
         
            +
                    remote_trace_contexts[bootstrap_room] = (
         
     | 
| 
      
 386 
     | 
    
         
            +
                        SglangTracePropagateContext.instance_from_dict(
         
     | 
| 
      
 387 
     | 
    
         
            +
                            remote_reqs_trace_contexts[bootstrap_room]
         
     | 
| 
      
 388 
     | 
    
         
            +
                        )
         
     | 
| 
      
 389 
     | 
    
         
            +
                    )
         
     | 
| 
      
 390 
     | 
    
         
            +
             
     | 
| 
      
 391 
     | 
    
         
            +
             
     | 
| 
       330 
392 
     | 
    
         
             
            def trace_req_start(
         
     | 
| 
       331 
393 
     | 
    
         
             
                rid: str,
         
     | 
| 
       332 
394 
     | 
    
         
             
                bootstrap_room: Optional[int] = None,
         
     | 
| 
       333 
395 
     | 
    
         
             
                ts: Optional[int] = None,
         
     | 
| 
      
 396 
     | 
    
         
            +
                role: Optional[str] = "null",
         
     | 
| 
       334 
397 
     | 
    
         
             
            ):
         
     | 
| 
       335 
398 
     | 
    
         
             
                if not tracing_enabled:
         
     | 
| 
       336 
399 
     | 
    
         
             
                    return
         
     | 
| 
         @@ -344,6 +407,7 @@ def trace_req_start( 
     | 
|
| 
       344 
407 
     | 
    
         
             
                    return
         
     | 
| 
       345 
408 
     | 
    
         | 
| 
       346 
409 
     | 
    
         
             
                # create req context and root span
         
     | 
| 
      
 410 
     | 
    
         
            +
                bootstrap_room = 0 if bootstrap_room is None else bootstrap_room
         
     | 
| 
       347 
411 
     | 
    
         
             
                reqs_context[rid] = SglangTraceReqContext(
         
     | 
| 
       348 
412 
     | 
    
         
             
                    rid=rid,
         
     | 
| 
       349 
413 
     | 
    
         
             
                    start_time_ns=ts,
         
     | 
| 
         @@ -352,23 +416,42 @@ def trace_req_start( 
     | 
|
| 
       352 
416 
     | 
    
         
             
                    is_copy=False,
         
     | 
| 
       353 
417 
     | 
    
         
             
                )
         
     | 
| 
       354 
418 
     | 
    
         | 
| 
      
 419 
     | 
    
         
            +
                # create bootstrap room span
         
     | 
| 
      
 420 
     | 
    
         
            +
                tracer = threads_info[pid].tracer
         
     | 
| 
      
 421 
     | 
    
         
            +
                if str(bootstrap_room) not in remote_trace_contexts:
         
     | 
| 
      
 422 
     | 
    
         
            +
                    attrs = {"bootstrap_room": str(hex(bootstrap_room))}
         
     | 
| 
      
 423 
     | 
    
         
            +
                    bootstrap_room_span = tracer.start_span(
         
     | 
| 
      
 424 
     | 
    
         
            +
                        name=f"Bootstrap Room {hex(bootstrap_room)}",
         
     | 
| 
      
 425 
     | 
    
         
            +
                        start_time=ts,
         
     | 
| 
      
 426 
     | 
    
         
            +
                        attributes=attrs,
         
     | 
| 
      
 427 
     | 
    
         
            +
                    )
         
     | 
| 
      
 428 
     | 
    
         
            +
                    reqs_context[rid].bootstrap_room_span = bootstrap_room_span
         
     | 
| 
      
 429 
     | 
    
         
            +
                    bootstrap_room_span_context = trace.set_span_in_context(bootstrap_room_span)
         
     | 
| 
      
 430 
     | 
    
         
            +
                else:
         
     | 
| 
      
 431 
     | 
    
         
            +
                    bootstrap_room_span_context = remote_trace_contexts[
         
     | 
| 
      
 432 
     | 
    
         
            +
                        str(bootstrap_room)
         
     | 
| 
      
 433 
     | 
    
         
            +
                    ].root_span_context
         
     | 
| 
      
 434 
     | 
    
         
            +
             
     | 
| 
       355 
435 
     | 
    
         
             
                # Drop the worker_id added by MultiTokenizer
         
     | 
| 
       356 
436 
     | 
    
         
             
                orig_rid = rid.split("_")[-1]
         
     | 
| 
       357 
     | 
    
         
            -
                 
     | 
| 
      
 437 
     | 
    
         
            +
                role = "" if role == "null" else role
         
     | 
| 
      
 438 
     | 
    
         
            +
                attrs = {"rid": orig_rid}
         
     | 
| 
       358 
439 
     | 
    
         
             
                root_span = tracer.start_span(
         
     | 
| 
       359 
     | 
    
         
            -
                    name=f"Req {orig_rid[:8]}",
         
     | 
| 
      
 440 
     | 
    
         
            +
                    name=f"{role} Req {orig_rid[:8]}",
         
     | 
| 
       360 
441 
     | 
    
         
             
                    start_time=ts,
         
     | 
| 
      
 442 
     | 
    
         
            +
                    context=bootstrap_room_span_context,
         
     | 
| 
      
 443 
     | 
    
         
            +
                    attributes=attrs,
         
     | 
| 
       361 
444 
     | 
    
         
             
                )
         
     | 
| 
       362 
445 
     | 
    
         | 
| 
       363 
446 
     | 
    
         
             
                root_span.set_attributes(
         
     | 
| 
       364 
447 
     | 
    
         
             
                    {
         
     | 
| 
       365 
448 
     | 
    
         
             
                        "rid": rid,
         
     | 
| 
       366 
     | 
    
         
            -
                        "bootstrap_room": bootstrap_room if bootstrap_room else "None",
         
     | 
| 
       367 
449 
     | 
    
         
             
                    }
         
     | 
| 
       368 
450 
     | 
    
         
             
                )
         
     | 
| 
       369 
451 
     | 
    
         | 
| 
       370 
452 
     | 
    
         
             
                reqs_context[rid].root_span = root_span
         
     | 
| 
       371 
453 
     | 
    
         
             
                reqs_context[rid].root_span_context = trace.set_span_in_context(root_span)
         
     | 
| 
      
 454 
     | 
    
         
            +
                reqs_context[rid].bootstrap_room_span_context = bootstrap_room_span_context
         
     | 
| 
       372 
455 
     | 
    
         | 
| 
       373 
456 
     | 
    
         
             
                # create thread context and thread span
         
     | 
| 
       374 
457 
     | 
    
         
             
                reqs_context[rid].threads_context[pid] = __create_thread_context(
         
     | 
| 
         @@ -376,6 +459,10 @@ def trace_req_start( 
     | 
|
| 
       376 
459 
     | 
    
         
             
                    reqs_context[rid].root_span_context,
         
     | 
| 
       377 
460 
     | 
    
         
             
                    ts,
         
     | 
| 
       378 
461 
     | 
    
         
             
                )
         
     | 
| 
      
 462 
     | 
    
         
            +
                if str(bootstrap_room) in remote_trace_contexts:
         
     | 
| 
      
 463 
     | 
    
         
            +
                    reqs_context[rid].threads_context[pid].last_span_context = (
         
     | 
| 
      
 464 
     | 
    
         
            +
                        remote_trace_contexts[str(bootstrap_room)].prev_span_context
         
     | 
| 
      
 465 
     | 
    
         
            +
                    )
         
     | 
| 
       379 
466 
     | 
    
         | 
| 
       380 
467 
     | 
    
         | 
| 
       381 
468 
     | 
    
         
             
            def trace_req_finish(
         
     | 
| 
         @@ -399,6 +486,10 @@ def trace_req_finish( 
     | 
|
| 
       399 
486 
     | 
    
         
             
                    req_context.root_span.set_attributes(attrs)
         
     | 
| 
       400 
487 
     | 
    
         | 
| 
       401 
488 
     | 
    
         
             
                req_context.root_span.end(end_time=ts)
         
     | 
| 
      
 489 
     | 
    
         
            +
                if str(req_context.bootstrap_room) in remote_trace_contexts:
         
     | 
| 
      
 490 
     | 
    
         
            +
                    del remote_trace_contexts[str(req_context.bootstrap_room)]
         
     | 
| 
      
 491 
     | 
    
         
            +
                else:
         
     | 
| 
      
 492 
     | 
    
         
            +
                    req_context.bootstrap_room_span.end(end_time=ts)
         
     | 
| 
       402 
493 
     | 
    
         | 
| 
       403 
494 
     | 
    
         
             
                del reqs_context[rid]
         
     | 
| 
       404 
495 
     | 
    
         | 
| 
         @@ -518,7 +609,9 @@ trace_slice = trace_slice_end 
     | 
|
| 
       518 
609 
     | 
    
         | 
| 
       519 
610 
     | 
    
         | 
| 
       520 
611 
     | 
    
         
             
            # Add event to the current slice on the same thread with the same rid.
         
     | 
| 
       521 
     | 
    
         
            -
            def trace_event( 
     | 
| 
      
 612 
     | 
    
         
            +
            def trace_event(
         
     | 
| 
      
 613 
     | 
    
         
            +
                name: str, rid: str, ts: Optional[int] = None, attrs: Dict[str, Any] = None
         
     | 
| 
      
 614 
     | 
    
         
            +
            ):
         
     | 
| 
       522 
615 
     | 
    
         
             
                if not tracing_enabled:
         
     | 
| 
       523 
616 
     | 
    
         
             
                    return
         
     | 
| 
       524 
617 
     | 
    
         | 
| 
         @@ -539,7 +632,7 @@ def trace_event(name: str, rid: str, ts: Optional[int] = None): 
     | 
|
| 
       539 
632 
     | 
    
         
             
                ts = ts or __get_cur_time_ns()
         
     | 
| 
       540 
633 
     | 
    
         | 
| 
       541 
634 
     | 
    
         
             
                slice_info = thread_context.cur_slice_stack[-1]
         
     | 
| 
       542 
     | 
    
         
            -
                slice_info.span.add_event(name=name, timestamp=ts)
         
     | 
| 
      
 635 
     | 
    
         
            +
                slice_info.span.add_event(name=name, timestamp=ts, attributes=attrs)
         
     | 
| 
       543 
636 
     | 
    
         | 
| 
       544 
637 
     | 
    
         | 
| 
       545 
638 
     | 
    
         
             
            # Add attrs to the current slice on the same thread with the same rid.
         
     | 
| 
         @@ -569,6 +662,9 @@ def trace_slice_batch( 
     | 
|
| 
       569 
662 
     | 
    
         
             
                name: str,
         
     | 
| 
       570 
663 
     | 
    
         
             
                reqs: List[Req],
         
     | 
| 
       571 
664 
     | 
    
         
             
            ):
         
     | 
| 
      
 665 
     | 
    
         
            +
                if not tracing_enabled:
         
     | 
| 
      
 666 
     | 
    
         
            +
                    return
         
     | 
| 
      
 667 
     | 
    
         
            +
             
     | 
| 
       572 
668 
     | 
    
         
             
                for req in reqs:
         
     | 
| 
       573 
669 
     | 
    
         
             
                    trace_slice(
         
     | 
| 
       574 
670 
     | 
    
         
             
                        name,
         
     | 
| 
         @@ -576,3 +672,16 @@ def trace_slice_batch( 
     | 
|
| 
       576 
672 
     | 
    
         
             
                        auto_next_anon=not req.finished(),
         
     | 
| 
       577 
673 
     | 
    
         
             
                        thread_finish_flag=req.finished(),
         
     | 
| 
       578 
674 
     | 
    
         
             
                    )
         
     | 
| 
      
 675 
     | 
    
         
            +
             
     | 
| 
      
 676 
     | 
    
         
            +
             
     | 
| 
      
 677 
     | 
    
         
            +
            def trace_event_batch(
         
     | 
| 
      
 678 
     | 
    
         
            +
                name: str,
         
     | 
| 
      
 679 
     | 
    
         
            +
                reqs: List[Req],
         
     | 
| 
      
 680 
     | 
    
         
            +
                ts: Optional[int] = None,
         
     | 
| 
      
 681 
     | 
    
         
            +
                attrs: Dict[str, Any] = None,
         
     | 
| 
      
 682 
     | 
    
         
            +
            ):
         
     | 
| 
      
 683 
     | 
    
         
            +
                if not tracing_enabled:
         
     | 
| 
      
 684 
     | 
    
         
            +
                    return
         
     | 
| 
      
 685 
     | 
    
         
            +
             
     | 
| 
      
 686 
     | 
    
         
            +
                for req in reqs:
         
     | 
| 
      
 687 
     | 
    
         
            +
                    trace_event(name, req.rid, ts=ts, attrs=attrs)
         
     | 
    
        sglang/srt/utils/common.py
    CHANGED
    
    | 
         @@ -56,7 +56,6 @@ from json import JSONDecodeError 
     | 
|
| 
       56 
56 
     | 
    
         
             
            from multiprocessing.reduction import ForkingPickler
         
     | 
| 
       57 
57 
     | 
    
         
             
            from pathlib import Path
         
     | 
| 
       58 
58 
     | 
    
         
             
            from typing import (
         
     | 
| 
       59 
     | 
    
         
            -
                TYPE_CHECKING,
         
     | 
| 
       60 
59 
     | 
    
         
             
                Any,
         
     | 
| 
       61 
60 
     | 
    
         
             
                Callable,
         
     | 
| 
       62 
61 
     | 
    
         
             
                Dict,
         
     | 
| 
         @@ -94,9 +93,6 @@ from typing_extensions import Literal 
     | 
|
| 
       94 
93 
     | 
    
         
             
            from sglang.srt.environ import envs
         
     | 
| 
       95 
94 
     | 
    
         
             
            from sglang.srt.metrics.func_timer import enable_func_timer
         
     | 
| 
       96 
95 
     | 
    
         | 
| 
       97 
     | 
    
         
            -
            if TYPE_CHECKING:
         
     | 
| 
       98 
     | 
    
         
            -
                from sglang.srt.layers.quantization.base_config import QuantizeMethodBase
         
     | 
| 
       99 
     | 
    
         
            -
             
     | 
| 
       100 
96 
     | 
    
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 
       101 
97 
     | 
    
         | 
| 
       102 
98 
     | 
    
         
             
            show_time_cost = False
         
     | 
| 
         @@ -138,6 +134,7 @@ def is_xpu() -> bool: 
     | 
|
| 
       138 
134 
     | 
    
         
             
                return hasattr(torch, "xpu") and torch.xpu.is_available()
         
     | 
| 
       139 
135 
     | 
    
         | 
| 
       140 
136 
     | 
    
         | 
| 
      
 137 
     | 
    
         
            +
            @lru_cache(maxsize=1)
         
     | 
| 
       141 
138 
     | 
    
         
             
            def is_npu() -> bool:
         
     | 
| 
       142 
139 
     | 
    
         
             
                return hasattr(torch, "npu") and torch.npu.is_available()
         
     | 
| 
       143 
140 
     | 
    
         | 
| 
         @@ -191,7 +188,16 @@ is_hopper_with_cuda_12_3 = lambda: _check(9) 
     | 
|
| 
       191 
188 
     | 
    
         
             
            def is_blackwell():
         
     | 
| 
       192 
189 
     | 
    
         
             
                if not is_cuda():
         
     | 
| 
       193 
190 
     | 
    
         
             
                    return False
         
     | 
| 
       194 
     | 
    
         
            -
                return torch.cuda.get_device_capability()[0]  
     | 
| 
      
 191 
     | 
    
         
            +
                return torch.cuda.get_device_capability()[0] in [10, 12]
         
     | 
| 
      
 192 
     | 
    
         
            +
             
     | 
| 
      
 193 
     | 
    
         
            +
             
     | 
| 
      
 194 
     | 
    
         
            +
            @lru_cache(maxsize=1)
         
     | 
| 
      
 195 
     | 
    
         
            +
            def is_blackwell_supported(device=None) -> bool:
         
     | 
| 
      
 196 
     | 
    
         
            +
                if not is_cuda_alike():
         
     | 
| 
      
 197 
     | 
    
         
            +
                    return False
         
     | 
| 
      
 198 
     | 
    
         
            +
                return (torch.cuda.get_device_capability(device)[0] in [10, 12]) and (
         
     | 
| 
      
 199 
     | 
    
         
            +
                    torch.version.cuda >= "12.8"
         
     | 
| 
      
 200 
     | 
    
         
            +
                )
         
     | 
| 
       195 
201 
     | 
    
         | 
| 
       196 
202 
     | 
    
         | 
| 
       197 
203 
     | 
    
         
             
            @lru_cache(maxsize=1)
         
     | 
| 
         @@ -1069,32 +1075,6 @@ def monkey_patch_p2p_access_check(): 
     | 
|
| 
       1069 
1075 
     | 
    
         
             
                setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)
         
     | 
| 
       1070 
1076 
     | 
    
         | 
| 
       1071 
1077 
     | 
    
         | 
| 
       1072 
     | 
    
         
            -
            def monkey_patch_vllm_gguf_config():
         
     | 
| 
       1073 
     | 
    
         
            -
                try:
         
     | 
| 
       1074 
     | 
    
         
            -
                    from vllm.model_executor.layers.quantization.gguf import (
         
     | 
| 
       1075 
     | 
    
         
            -
                        GGUFConfig,
         
     | 
| 
       1076 
     | 
    
         
            -
                        GGUFEmbeddingMethod,
         
     | 
| 
       1077 
     | 
    
         
            -
                        GGUFLinearMethod,
         
     | 
| 
       1078 
     | 
    
         
            -
                    )
         
     | 
| 
       1079 
     | 
    
         
            -
                except ImportError:
         
     | 
| 
       1080 
     | 
    
         
            -
                    return
         
     | 
| 
       1081 
     | 
    
         
            -
             
     | 
| 
       1082 
     | 
    
         
            -
                from sglang.srt.layers.linear import LinearBase
         
     | 
| 
       1083 
     | 
    
         
            -
                from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
         
     | 
| 
       1084 
     | 
    
         
            -
             
     | 
| 
       1085 
     | 
    
         
            -
                def get_quant_method_with_embedding_replaced(
         
     | 
| 
       1086 
     | 
    
         
            -
                    self, layer: torch.nn.Module, prefix: str
         
     | 
| 
       1087 
     | 
    
         
            -
                ) -> Optional[QuantizeMethodBase]:
         
     | 
| 
       1088 
     | 
    
         
            -
                    if isinstance(layer, LinearBase):
         
     | 
| 
       1089 
     | 
    
         
            -
                        return GGUFLinearMethod(self)
         
     | 
| 
       1090 
     | 
    
         
            -
                    elif isinstance(layer, VocabParallelEmbedding):
         
     | 
| 
       1091 
     | 
    
         
            -
                        # patch to own VocabParallelEmbedding
         
     | 
| 
       1092 
     | 
    
         
            -
                        return GGUFEmbeddingMethod(self)
         
     | 
| 
       1093 
     | 
    
         
            -
                    return None
         
     | 
| 
       1094 
     | 
    
         
            -
             
     | 
| 
       1095 
     | 
    
         
            -
                setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
         
     | 
| 
       1096 
     | 
    
         
            -
             
     | 
| 
       1097 
     | 
    
         
            -
             
     | 
| 
       1098 
1078 
     | 
    
         
             
            def set_ulimit(target_soft_limit=65535):
         
     | 
| 
       1099 
1079 
     | 
    
         
             
                # number of open files
         
     | 
| 
       1100 
1080 
     | 
    
         
             
                resource_type = resource.RLIMIT_NOFILE
         
     | 
| 
         @@ -1131,9 +1111,9 @@ def add_api_key_middleware(app, api_key: str): 
     | 
|
| 
       1131 
1111 
     | 
    
         
             
                async def authentication(request, call_next):
         
     | 
| 
       1132 
1112 
     | 
    
         
             
                    if request.method == "OPTIONS":
         
     | 
| 
       1133 
1113 
     | 
    
         
             
                        return await call_next(request)
         
     | 
| 
       1134 
     | 
    
         
            -
                    if request.url.path.startswith("/health") 
     | 
| 
       1135 
     | 
    
         
            -
                         
     | 
| 
       1136 
     | 
    
         
            -
                     
     | 
| 
      
 1114 
     | 
    
         
            +
                    if request.url.path.startswith("/health") or request.url.path.startswith(
         
     | 
| 
      
 1115 
     | 
    
         
            +
                        "/metrics"
         
     | 
| 
      
 1116 
     | 
    
         
            +
                    ):
         
     | 
| 
       1137 
1117 
     | 
    
         
             
                        return await call_next(request)
         
     | 
| 
       1138 
1118 
     | 
    
         
             
                    if request.headers.get("Authorization") != "Bearer " + api_key:
         
     | 
| 
       1139 
1119 
     | 
    
         
             
                        return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         
     | 
| 
         @@ -1259,42 +1239,34 @@ def point_to_point_pyobj( 
     | 
|
| 
       1259 
1239 
     | 
    
         
             
                dst: int = 1,
         
     | 
| 
       1260 
1240 
     | 
    
         
             
            ):
         
     | 
| 
       1261 
1241 
     | 
    
         
             
                """Send data from src to dst in group using DeviceToDevice communication."""
         
     | 
| 
       1262 
     | 
    
         
            -
             
     | 
| 
      
 1242 
     | 
    
         
            +
                device = torch.get_device_module().current_device()
         
     | 
| 
       1263 
1243 
     | 
    
         
             
                if rank == src:
         
     | 
| 
       1264 
1244 
     | 
    
         
             
                    if len(data) == 0:
         
     | 
| 
       1265 
     | 
    
         
            -
                        tensor_size = torch.tensor(
         
     | 
| 
       1266 
     | 
    
         
            -
                            [0], dtype=torch.long, device=torch.cuda.current_device()
         
     | 
| 
       1267 
     | 
    
         
            -
                        )
         
     | 
| 
      
 1245 
     | 
    
         
            +
                        tensor_size = torch.tensor([0], dtype=torch.long, device=device)
         
     | 
| 
       1268 
1246 
     | 
    
         
             
                        dist.send(tensor_size, dst=dst, group=group)
         
     | 
| 
       1269 
1247 
     | 
    
         
             
                    else:
         
     | 
| 
       1270 
1248 
     | 
    
         
             
                        serialized_data = pickle.dumps(data)
         
     | 
| 
       1271 
1249 
     | 
    
         
             
                        size = len(serialized_data)
         
     | 
| 
       1272 
1250 
     | 
    
         
             
                        tensor_data = torch.ByteTensor(
         
     | 
| 
       1273 
1251 
     | 
    
         
             
                            np.frombuffer(serialized_data, dtype=np.uint8)
         
     | 
| 
       1274 
     | 
    
         
            -
                        ). 
     | 
| 
       1275 
     | 
    
         
            -
                            device= 
     | 
| 
      
 1252 
     | 
    
         
            +
                        ).to(
         
     | 
| 
      
 1253 
     | 
    
         
            +
                            device=device
         
     | 
| 
       1276 
1254 
     | 
    
         
             
                        )  # Move to GPU
         
     | 
| 
       1277 
     | 
    
         
            -
                        tensor_size = torch.tensor(
         
     | 
| 
       1278 
     | 
    
         
            -
                            [size], dtype=torch.long, device=torch.cuda.current_device()
         
     | 
| 
       1279 
     | 
    
         
            -
                        )
         
     | 
| 
      
 1255 
     | 
    
         
            +
                        tensor_size = torch.tensor([size], dtype=torch.long, device=device)
         
     | 
| 
       1280 
1256 
     | 
    
         | 
| 
       1281 
1257 
     | 
    
         
             
                        dist.send(tensor_size, dst=dst, group=group)
         
     | 
| 
       1282 
1258 
     | 
    
         
             
                        dist.send(tensor_data, dst=dst, group=group)
         
     | 
| 
       1283 
1259 
     | 
    
         
             
                    return data
         
     | 
| 
       1284 
1260 
     | 
    
         | 
| 
       1285 
1261 
     | 
    
         
             
                elif rank == dst:
         
     | 
| 
       1286 
     | 
    
         
            -
                    tensor_size = torch.tensor(
         
     | 
| 
       1287 
     | 
    
         
            -
                        [0], dtype=torch.long, device=torch.cuda.current_device()
         
     | 
| 
       1288 
     | 
    
         
            -
                    )
         
     | 
| 
      
 1262 
     | 
    
         
            +
                    tensor_size = torch.tensor([0], dtype=torch.long, device=device)
         
     | 
| 
       1289 
1263 
     | 
    
         
             
                    dist.recv(tensor_size, src=src, group=group)
         
     | 
| 
       1290 
1264 
     | 
    
         
             
                    size = tensor_size.item()
         
     | 
| 
       1291 
1265 
     | 
    
         | 
| 
       1292 
1266 
     | 
    
         
             
                    if size == 0:
         
     | 
| 
       1293 
1267 
     | 
    
         
             
                        return []
         
     | 
| 
       1294 
1268 
     | 
    
         | 
| 
       1295 
     | 
    
         
            -
                    tensor_data = torch.empty(
         
     | 
| 
       1296 
     | 
    
         
            -
                        size, dtype=torch.uint8, device=torch.cuda.current_device()
         
     | 
| 
       1297 
     | 
    
         
            -
                    )
         
     | 
| 
      
 1269 
     | 
    
         
            +
                    tensor_data = torch.empty(size, dtype=torch.uint8, device=device)
         
     | 
| 
       1298 
1270 
     | 
    
         
             
                    dist.recv(tensor_data, src=src, group=group)
         
     | 
| 
       1299 
1271 
     | 
    
         | 
| 
       1300 
1272 
     | 
    
         
             
                    serialized_data = bytes(
         
     | 
| 
         @@ -2106,7 +2078,7 @@ class MultiprocessingSerializer: 
     | 
|
| 
       2106 
2078 
     | 
    
         | 
| 
       2107 
2079 
     | 
    
         
             
                    if output_str:
         
     | 
| 
       2108 
2080 
     | 
    
         
             
                        # Convert bytes to base64-encoded string
         
     | 
| 
       2109 
     | 
    
         
            -
                        pybase64.b64encode(output).decode("utf-8")
         
     | 
| 
      
 2081 
     | 
    
         
            +
                        output = pybase64.b64encode(output).decode("utf-8")
         
     | 
| 
       2110 
2082 
     | 
    
         | 
| 
       2111 
2083 
     | 
    
         
             
                    return output
         
     | 
| 
       2112 
2084 
     | 
    
         | 
| 
         @@ -2125,7 +2097,78 @@ class MultiprocessingSerializer: 
     | 
|
| 
       2125 
2097 
     | 
    
         
             
                        # Decode base64 string to bytes
         
     | 
| 
       2126 
2098 
     | 
    
         
             
                        data = pybase64.b64decode(data, validate=True)
         
     | 
| 
       2127 
2099 
     | 
    
         | 
| 
       2128 
     | 
    
         
            -
                    return  
     | 
| 
      
 2100 
     | 
    
         
            +
                    return SafeUnpickler(io.BytesIO(data)).load()
         
     | 
| 
      
 2101 
     | 
    
         
            +
             
     | 
| 
      
 2102 
     | 
    
         
            +
             
     | 
| 
      
 2103 
     | 
    
         
            +
            class SafeUnpickler(pickle.Unpickler):
         
     | 
| 
      
 2104 
     | 
    
         
            +
                ALLOWED_MODULE_PREFIXES = {
         
     | 
| 
      
 2105 
     | 
    
         
            +
                    # --- Python types ---
         
     | 
| 
      
 2106 
     | 
    
         
            +
                    "builtins.",
         
     | 
| 
      
 2107 
     | 
    
         
            +
                    "collections.",
         
     | 
| 
      
 2108 
     | 
    
         
            +
                    "copyreg.",
         
     | 
| 
      
 2109 
     | 
    
         
            +
                    "functools.",
         
     | 
| 
      
 2110 
     | 
    
         
            +
                    "itertools.",
         
     | 
| 
      
 2111 
     | 
    
         
            +
                    "operator.",
         
     | 
| 
      
 2112 
     | 
    
         
            +
                    "types.",
         
     | 
| 
      
 2113 
     | 
    
         
            +
                    "weakref.",
         
     | 
| 
      
 2114 
     | 
    
         
            +
                    # --- PyTorch types ---
         
     | 
| 
      
 2115 
     | 
    
         
            +
                    "torch.",
         
     | 
| 
      
 2116 
     | 
    
         
            +
                    "torch._tensor.",
         
     | 
| 
      
 2117 
     | 
    
         
            +
                    "torch.storage.",
         
     | 
| 
      
 2118 
     | 
    
         
            +
                    "torch.nn.parameter.",
         
     | 
| 
      
 2119 
     | 
    
         
            +
                    "torch.autograd.function.",
         
     | 
| 
      
 2120 
     | 
    
         
            +
                    # --- torch distributed ---
         
     | 
| 
      
 2121 
     | 
    
         
            +
                    "torch.distributed.",
         
     | 
| 
      
 2122 
     | 
    
         
            +
                    "torch.distributed._shard.",
         
     | 
| 
      
 2123 
     | 
    
         
            +
                    "torch.distributed._composable.",
         
     | 
| 
      
 2124 
     | 
    
         
            +
                    "torch._C._distributed_c10d.",
         
     | 
| 
      
 2125 
     | 
    
         
            +
                    "torch._C._distributed_fsdp.",
         
     | 
| 
      
 2126 
     | 
    
         
            +
                    "torch.distributed.optim.",
         
     | 
| 
      
 2127 
     | 
    
         
            +
                    # --- multiprocessing ---
         
     | 
| 
      
 2128 
     | 
    
         
            +
                    "multiprocessing.resource_sharer.",
         
     | 
| 
      
 2129 
     | 
    
         
            +
                    "multiprocessing.reduction.",
         
     | 
| 
      
 2130 
     | 
    
         
            +
                    "pickletools.",
         
     | 
| 
      
 2131 
     | 
    
         
            +
                    # --- PEFT / LoRA ---
         
     | 
| 
      
 2132 
     | 
    
         
            +
                    "peft.",
         
     | 
| 
      
 2133 
     | 
    
         
            +
                    "transformers.",
         
     | 
| 
      
 2134 
     | 
    
         
            +
                    "huggingface_hub.",
         
     | 
| 
      
 2135 
     | 
    
         
            +
                    # --- SGLang & Unitest ---
         
     | 
| 
      
 2136 
     | 
    
         
            +
                    "sglang.srt.weight_sync.tensor_bucket.",
         
     | 
| 
      
 2137 
     | 
    
         
            +
                    "sglang.srt.model_executor.model_runner.",
         
     | 
| 
      
 2138 
     | 
    
         
            +
                    "sglang.srt.layers.",
         
     | 
| 
      
 2139 
     | 
    
         
            +
                    "sglang.srt.utils.",
         
     | 
| 
      
 2140 
     | 
    
         
            +
                }
         
     | 
| 
      
 2141 
     | 
    
         
            +
             
     | 
| 
      
 2142 
     | 
    
         
            +
                DENY_CLASSES = {
         
     | 
| 
      
 2143 
     | 
    
         
            +
                    ("builtins", "eval"),
         
     | 
| 
      
 2144 
     | 
    
         
            +
                    ("builtins", "exec"),
         
     | 
| 
      
 2145 
     | 
    
         
            +
                    ("builtins", "compile"),
         
     | 
| 
      
 2146 
     | 
    
         
            +
                    ("os", "system"),
         
     | 
| 
      
 2147 
     | 
    
         
            +
                    ("subprocess", "Popen"),
         
     | 
| 
      
 2148 
     | 
    
         
            +
                    ("subprocess", "run"),
         
     | 
| 
      
 2149 
     | 
    
         
            +
                    ("codecs", "decode"),
         
     | 
| 
      
 2150 
     | 
    
         
            +
                    ("types", "CodeType"),
         
     | 
| 
      
 2151 
     | 
    
         
            +
                    ("types", "FunctionType"),
         
     | 
| 
      
 2152 
     | 
    
         
            +
                }
         
     | 
| 
      
 2153 
     | 
    
         
            +
             
     | 
| 
      
 2154 
     | 
    
         
            +
                def find_class(self, module, name):
         
     | 
| 
      
 2155 
     | 
    
         
            +
                    # Block deterministic attacks
         
     | 
| 
      
 2156 
     | 
    
         
            +
                    if (module, name) in self.DENY_CLASSES:
         
     | 
| 
      
 2157 
     | 
    
         
            +
                        raise RuntimeError(
         
     | 
| 
      
 2158 
     | 
    
         
            +
                            f"Blocked unsafe class loading ({module}.{name}), "
         
     | 
| 
      
 2159 
     | 
    
         
            +
                            f"to prevent exploitation of CVE-2025-10164"
         
     | 
| 
      
 2160 
     | 
    
         
            +
                        )
         
     | 
| 
      
 2161 
     | 
    
         
            +
                    # Allowlist of safe-to-load modules.
         
     | 
| 
      
 2162 
     | 
    
         
            +
                    if any(
         
     | 
| 
      
 2163 
     | 
    
         
            +
                        (module + ".").startswith(prefix) for prefix in self.ALLOWED_MODULE_PREFIXES
         
     | 
| 
      
 2164 
     | 
    
         
            +
                    ):
         
     | 
| 
      
 2165 
     | 
    
         
            +
                        return super().find_class(module, name)
         
     | 
| 
      
 2166 
     | 
    
         
            +
             
     | 
| 
      
 2167 
     | 
    
         
            +
                    # Block everything else. (Potential attack surface)
         
     | 
| 
      
 2168 
     | 
    
         
            +
                    raise RuntimeError(
         
     | 
| 
      
 2169 
     | 
    
         
            +
                        f"Blocked unsafe class loading ({module}.{name}), "
         
     | 
| 
      
 2170 
     | 
    
         
            +
                        f"to prevent exploitation of CVE-2025-10164"
         
     | 
| 
      
 2171 
     | 
    
         
            +
                    )
         
     | 
| 
       2129 
2172 
     | 
    
         | 
| 
       2130 
2173 
     | 
    
         | 
| 
       2131 
2174 
     | 
    
         
             
            def debug_timing(func):
         
     | 
| 
         @@ -2308,16 +2351,24 @@ def launch_dummy_health_check_server(host, port, enable_metrics): 
     | 
|
| 
       2308 
2351 
     | 
    
         
             
                )
         
     | 
| 
       2309 
2352 
     | 
    
         
             
                server = uvicorn.Server(config=config)
         
     | 
| 
       2310 
2353 
     | 
    
         | 
| 
       2311 
     | 
    
         
            -
                 
     | 
| 
       2312 
     | 
    
         
            -
             
     | 
| 
       2313 
     | 
    
         
            -
             
     | 
| 
       2314 
     | 
    
         
            -
             
     | 
| 
       2315 
     | 
    
         
            -
             
     | 
| 
       2316 
     | 
    
         
            -
                     
     | 
| 
      
 2354 
     | 
    
         
            +
                # Run server in a background daemon thread with its own event loop
         
     | 
| 
      
 2355 
     | 
    
         
            +
                # This prevents blocking the main thread while still serving health checks
         
     | 
| 
      
 2356 
     | 
    
         
            +
                def run_server():
         
     | 
| 
      
 2357 
     | 
    
         
            +
                    try:
         
     | 
| 
      
 2358 
     | 
    
         
            +
                        asyncio.run(server.serve())
         
     | 
| 
      
 2359 
     | 
    
         
            +
                    except Exception as e:
         
     | 
| 
      
 2360 
     | 
    
         
            +
                        logger.error(f"Dummy health check server failed to start: {e}")
         
     | 
| 
      
 2361 
     | 
    
         
            +
                        raise
         
     | 
| 
      
 2362 
     | 
    
         
            +
                    finally:
         
     | 
| 
      
 2363 
     | 
    
         
            +
                        logger.info(f"Dummy health check server stopped at {host}:{port}")
         
     | 
| 
       2317 
2364 
     | 
    
         | 
| 
       2318 
     | 
    
         
            -
                 
     | 
| 
       2319 
     | 
    
         
            -
                     
     | 
| 
       2320 
     | 
    
         
            -
             
     | 
| 
      
 2365 
     | 
    
         
            +
                thread = threading.Thread(
         
     | 
| 
      
 2366 
     | 
    
         
            +
                    target=run_server, daemon=True, name="health-check-server"
         
     | 
| 
      
 2367 
     | 
    
         
            +
                )
         
     | 
| 
      
 2368 
     | 
    
         
            +
                thread.start()
         
     | 
| 
      
 2369 
     | 
    
         
            +
                logger.info(
         
     | 
| 
      
 2370 
     | 
    
         
            +
                    f"Dummy health check server started in background thread at {host}:{port}"
         
     | 
| 
      
 2371 
     | 
    
         
            +
                )
         
     | 
| 
       2321 
2372 
     | 
    
         | 
| 
       2322 
2373 
     | 
    
         | 
| 
       2323 
2374 
     | 
    
         
             
            def create_checksum(directory: str):
         
     | 
| 
         @@ -2578,17 +2629,12 @@ def get_local_ip_auto(fallback: str = None) -> str: 
     | 
|
| 
       2578 
2629 
     | 
    
         
             
                raise ValueError("Can not get local ip")
         
     | 
| 
       2579 
2630 
     | 
    
         | 
| 
       2580 
2631 
     | 
    
         | 
| 
       2581 
     | 
    
         
            -
            def is_page_size_one(server_args):
         
     | 
| 
       2582 
     | 
    
         
            -
                return server_args.page_size == 1
         
     | 
| 
       2583 
     | 
    
         
            -
             
     | 
| 
       2584 
     | 
    
         
            -
             
     | 
| 
       2585 
2632 
     | 
    
         
             
            # TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
         
     | 
| 
       2586 
2633 
     | 
    
         
             
            # TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
         
     | 
| 
       2587 
2634 
     | 
    
         
             
            def is_no_spec_infer_or_topk_one(server_args):
         
     | 
| 
       2588 
2635 
     | 
    
         
             
                return server_args.speculative_eagle_topk is None or (
         
     | 
| 
       2589 
     | 
    
         
            -
                    server_args.speculative_eagle_topk  
     | 
| 
       2590 
     | 
    
         
            -
                    and server_args. 
     | 
| 
       2591 
     | 
    
         
            -
                    and is_page_size_one(server_args)
         
     | 
| 
      
 2636 
     | 
    
         
            +
                    server_args.speculative_eagle_topk == 1
         
     | 
| 
      
 2637 
     | 
    
         
            +
                    and (server_args.page_size == 1 or server_args.page_size is None)
         
     | 
| 
       2592 
2638 
     | 
    
         
             
                )
         
     | 
| 
       2593 
2639 
     | 
    
         | 
| 
       2594 
2640 
     | 
    
         | 
| 
         @@ -3068,12 +3114,16 @@ def apply_module_patch(target_module, target_function, wrappers): 
     | 
|
| 
       3068 
3114 
     | 
    
         
             
                    setattr(original_module, target_function, candidate)
         
     | 
| 
       3069 
3115 
     | 
    
         | 
| 
       3070 
3116 
     | 
    
         
             
                for key, value in sys.modules.copy().items():
         
     | 
| 
       3071 
     | 
    
         
            -
                     
     | 
| 
       3072 
     | 
    
         
            -
                         
     | 
| 
       3073 
     | 
    
         
            -
             
     | 
| 
       3074 
     | 
    
         
            -
             
     | 
| 
       3075 
     | 
    
         
            -
             
     | 
| 
       3076 
     | 
    
         
            -
                         
     | 
| 
      
 3117 
     | 
    
         
            +
                    try:
         
     | 
| 
      
 3118 
     | 
    
         
            +
                        if (
         
     | 
| 
      
 3119 
     | 
    
         
            +
                            target_function is not None
         
     | 
| 
      
 3120 
     | 
    
         
            +
                            and hasattr(value, target_function)
         
     | 
| 
      
 3121 
     | 
    
         
            +
                            and id(getattr(value, target_function)) == original_function_id
         
     | 
| 
      
 3122 
     | 
    
         
            +
                        ):
         
     | 
| 
      
 3123 
     | 
    
         
            +
                            setattr(value, target_function, candidate)
         
     | 
| 
      
 3124 
     | 
    
         
            +
                    except ImportError as e:
         
     | 
| 
      
 3125 
     | 
    
         
            +
                        # Ignore some modules reporting ImportError when calling hasattr
         
     | 
| 
      
 3126 
     | 
    
         
            +
                        logger.warning(f"Ignore {value} reports ImportError with:\n{str(e)}")
         
     | 
| 
       3077 
3127 
     | 
    
         | 
| 
       3078 
3128 
     | 
    
         | 
| 
       3079 
3129 
     | 
    
         
             
            def parse_module_path(module_path, function_name, create_dummy):
         
     | 
| 
         @@ -3525,6 +3575,24 @@ def cached_triton_kernel(key_fn=None): 
     | 
|
| 
       3525 
3575 
     | 
    
         
             
                """
         
     | 
| 
       3526 
3576 
     | 
    
         | 
| 
       3527 
3577 
     | 
    
         
             
                def decorator(fn):
         
     | 
| 
       3528 
     | 
    
         
            -
                     
     | 
| 
      
 3578 
     | 
    
         
            +
                    if envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.get():
         
     | 
| 
      
 3579 
     | 
    
         
            +
                        logger.debug(
         
     | 
| 
      
 3580 
     | 
    
         
            +
                            f"{envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.name} = True. Using custom triton kernel cache."
         
     | 
| 
      
 3581 
     | 
    
         
            +
                        )
         
     | 
| 
      
 3582 
     | 
    
         
            +
                        return CachedKernel(fn, key_fn)
         
     | 
| 
      
 3583 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 3584 
     | 
    
         
            +
                        # Fallback to the native triton cache.
         
     | 
| 
      
 3585 
     | 
    
         
            +
                        logger.debug(
         
     | 
| 
      
 3586 
     | 
    
         
            +
                            f"{envs.SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE.name} = False. Using native triton kernel cache."
         
     | 
| 
      
 3587 
     | 
    
         
            +
                        )
         
     | 
| 
      
 3588 
     | 
    
         
            +
                        return fn
         
     | 
| 
       3529 
3589 
     | 
    
         | 
| 
       3530 
3590 
     | 
    
         
             
                return decorator
         
     | 
| 
      
 3591 
     | 
    
         
            +
             
     | 
| 
      
 3592 
     | 
    
         
            +
             
     | 
| 
      
 3593 
     | 
    
         
            +
            # Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
         
     | 
| 
      
 3594 
     | 
    
         
            +
            def calc_diff(x, y):
                """Return a normalized dissimilarity score between two tensors.

                Both inputs are cast to float64 and the result is computed as
                ``1 - 2 * sum(x * y) / sum(x * x + y * y)``. Identical non-zero
                inputs give 0 and exactly negated inputs give 2.

                Args:
                    x: Tensor-like object exposing ``.double()``.
                    y: Tensor-like object exposing ``.double()``; must be
                        broadcast-compatible with ``x``.

                Returns:
                    A 0-dim float64 tensor holding the score. When both inputs are
                    entirely zero (or empty) the score is 0 instead of NaN, since
                    the inputs are then identical.
                """
                x, y = x.double(), y.double()
                denominator = (x * x + y * y).sum()
                sim = 2 * (x * y).sum() / denominator
                diff = 1 - sim
                # A zero denominator means every element of x and y is zero, so the
                # plain formula evaluates 0/0 = NaN. Substitute the (zero-valued)
                # denominator in that case. Tensor.where keeps this free of
                # data-dependent Python branching, and NaN inputs still yield NaN
                # because NaN != 0 selects ``diff``.
                return diff.where(denominator != 0, denominator)
         
     | 
| 
         @@ -43,6 +43,7 @@ from sglang.srt.configs import ( 
     | 
|
| 
       43 
43 
     | 
    
         
             
                DotsVLMConfig,
         
     | 
| 
       44 
44 
     | 
    
         
             
                ExaoneConfig,
         
     | 
| 
       45 
45 
     | 
    
         
             
                FalconH1Config,
         
     | 
| 
      
 46 
     | 
    
         
            +
                KimiLinearConfig,
         
     | 
| 
       46 
47 
     | 
    
         
             
                KimiVLConfig,
         
     | 
| 
       47 
48 
     | 
    
         
             
                LongcatFlashConfig,
         
     | 
| 
       48 
49 
     | 
    
         
             
                MultiModalityConfig,
         
     | 
| 
         @@ -54,6 +55,7 @@ from sglang.srt.configs import ( 
     | 
|
| 
       54 
55 
     | 
    
         
             
            from sglang.srt.configs.deepseek_ocr import DeepseekVLV2Config
         
     | 
| 
       55 
56 
     | 
    
         
             
            from sglang.srt.configs.internvl import InternVLChatConfig
         
     | 
| 
       56 
57 
     | 
    
         
             
            from sglang.srt.connector import create_remote_connector
         
     | 
| 
      
 58 
     | 
    
         
            +
            from sglang.srt.multimodal.customized_mm_processor_utils import _CUSTOMIZED_MM_PROCESSOR
         
     | 
| 
       57 
59 
     | 
    
         
             
            from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
         
     | 
| 
       58 
60 
     | 
    
         | 
| 
       59 
61 
     | 
    
         
             
            _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
         
     | 
| 
         @@ -67,6 +69,7 @@ _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [ 
     | 
|
| 
       67 
69 
     | 
    
         
             
                Step3VLConfig,
         
     | 
| 
       68 
70 
     | 
    
         
             
                LongcatFlashConfig,
         
     | 
| 
       69 
71 
     | 
    
         
             
                Olmo3Config,
         
     | 
| 
      
 72 
     | 
    
         
            +
                KimiLinearConfig,
         
     | 
| 
       70 
73 
     | 
    
         
             
                Qwen3NextConfig,
         
     | 
| 
       71 
74 
     | 
    
         
             
                FalconH1Config,
         
     | 
| 
       72 
75 
     | 
    
         
             
                DotsVLMConfig,
         
     | 
| 
         @@ -172,6 +175,16 @@ def _load_deepseek_v32_model( 
     | 
|
| 
       172 
175 
     | 
    
         
             
                )
         
     | 
| 
       173 
176 
     | 
    
         | 
| 
       174 
177 
     | 
    
         | 
| 
      
 178 
     | 
    
         
            +
            def _is_deepseek_ocr_model(config: PretrainedConfig) -> bool:
                """Return True when ``config`` describes a DeepSeek-OCR checkpoint.

                The decision is based solely on the ``AutoModel`` entry of the
                config's ``auto_map``: it must name
                ``modeling_deepseekocr.DeepseekOCRForCausalLM``.

                Args:
                    config: Loaded model config; ``auto_map`` may be missing or None.

                Returns:
                    True if the ``AutoModel`` entry points at the DeepSeek-OCR model
                    class, False otherwise (including when there is no ``auto_map``).
                """
                # TODO: Remove this workaround once AutoConfig correctly identifies deepseek-ocr.
                # Hugging Face's AutoConfig currently misidentifies it as deepseekvl2.
                auto_map = getattr(config, "auto_map", None)
                if not auto_map:
                    return False

                auto_model = auto_map.get("AutoModel")
                if not isinstance(auto_model, str):
                    return False

                # When a config is loaded from a Hub repo id rather than a local
                # directory, transformers may rewrite auto_map entries to
                # "<repo_id>--<module>.<class>". Compare only the part after the
                # optional prefix so both forms are recognized.
                class_ref = auto_model.rsplit("--", 1)[-1]
                return class_ref == "modeling_deepseekocr.DeepseekOCRForCausalLM"
         
     | 
| 
      
 186 
     | 
    
         
            +
             
     | 
| 
      
 187 
     | 
    
         
            +
             
     | 
| 
       175 
188 
     | 
    
         
             
            @lru_cache_frozenset(maxsize=32)
         
     | 
| 
       176 
189 
     | 
    
         
             
            def get_config(
         
     | 
| 
       177 
190 
     | 
    
         
             
                model: str,
         
     | 
| 
         @@ -197,10 +210,6 @@ def get_config( 
     | 
|
| 
       197 
210 
     | 
    
         
             
                    config = AutoConfig.from_pretrained(
         
     | 
| 
       198 
211 
     | 
    
         
             
                        model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
         
     | 
| 
       199 
212 
     | 
    
         
             
                    )
         
     | 
| 
       200 
     | 
    
         
            -
                    if "deepseek-ai/DeepSeek-OCR" in model:
         
     | 
| 
       201 
     | 
    
         
            -
                        config.model_type = "deepseek-ocr"
         
     | 
| 
       202 
     | 
    
         
            -
                        # Due to an unknown reason, Hugging Face’s AutoConfig mistakenly recognizes the configuration of deepseek-ocr as deepseekvl2.
         
     | 
| 
       203 
     | 
    
         
            -
                        # This is a temporary workaround and will require further optimization.
         
     | 
| 
       204 
213 
     | 
    
         | 
| 
       205 
214 
     | 
    
         
             
                except ValueError as e:
         
     | 
| 
       206 
215 
     | 
    
         
             
                    if not "deepseek_v32" in str(e):
         
     | 
| 
         @@ -237,7 +246,11 @@ def get_config( 
     | 
|
| 
       237 
246 
     | 
    
         
             
                            setattr(config, key, val)
         
     | 
| 
       238 
247 
     | 
    
         | 
| 
       239 
248 
     | 
    
         
             
                if config.model_type in _CONFIG_REGISTRY:
         
     | 
| 
       240 
     | 
    
         
            -
                     
     | 
| 
      
 249 
     | 
    
         
            +
                    model_type = config.model_type
         
     | 
| 
      
 250 
     | 
    
         
            +
                    if model_type == "deepseek_vl_v2":
         
     | 
| 
      
 251 
     | 
    
         
            +
                        if _is_deepseek_ocr_model(config):
         
     | 
| 
      
 252 
     | 
    
         
            +
                            model_type = "deepseek-ocr"
         
     | 
| 
      
 253 
     | 
    
         
            +
                    config_class = _CONFIG_REGISTRY[model_type]
         
     | 
| 
       241 
254 
     | 
    
         
             
                    config = config_class.from_pretrained(model, revision=revision)
         
     | 
| 
       242 
255 
     | 
    
         
             
                    # NOTE(HandH1998): Qwen2VL requires `_name_or_path` attribute in `config`.
         
     | 
| 
       243 
256 
     | 
    
         
             
                    setattr(config, "_name_or_path", model)
         
     | 
| 
         @@ -441,6 +454,10 @@ def get_processor( 
     | 
|
| 
       441 
454 
     | 
    
         
             
                    **kwargs,
         
     | 
| 
       442 
455 
     | 
    
         
             
                )
         
     | 
| 
       443 
456 
     | 
    
         | 
| 
      
 457 
     | 
    
         
            +
                if _is_deepseek_ocr_model(config):
         
     | 
| 
      
 458 
     | 
    
         
            +
                    # Temporary hack for load deepseek-ocr
         
     | 
| 
      
 459 
     | 
    
         
            +
                    config.model_type = "deepseek-ocr"
         
     | 
| 
      
 460 
     | 
    
         
            +
             
     | 
| 
       444 
461 
     | 
    
         
             
                # fix: for Qwen2-VL and Sarashina2Vision models, inject default 'size' if not provided.
         
     | 
| 
       445 
462 
     | 
    
         
             
                if config.model_type in {"qwen2_vl", "sarashina2_vision"}:
         
     | 
| 
       446 
463 
     | 
    
         
             
                    if "size" not in kwargs:
         
     | 
| 
         @@ -458,13 +475,22 @@ def get_processor( 
     | 
|
| 
       458 
475 
     | 
    
         
             
                            **kwargs,
         
     | 
| 
       459 
476 
     | 
    
         
             
                        )
         
     | 
| 
       460 
477 
     | 
    
         
             
                    else:
         
     | 
| 
       461 
     | 
    
         
            -
                         
     | 
| 
       462 
     | 
    
         
            -
                             
     | 
| 
       463 
     | 
    
         
            -
             
     | 
| 
       464 
     | 
    
         
            -
             
     | 
| 
       465 
     | 
    
         
            -
             
     | 
| 
       466 
     | 
    
         
            -
             
     | 
| 
       467 
     | 
    
         
            -
             
     | 
| 
      
 478 
     | 
    
         
            +
                        if config.model_type in _CUSTOMIZED_MM_PROCESSOR:
         
     | 
| 
      
 479 
     | 
    
         
            +
                            processor = _CUSTOMIZED_MM_PROCESSOR[config.model_type].from_pretrained(
         
     | 
| 
      
 480 
     | 
    
         
            +
                                tokenizer_name,
         
     | 
| 
      
 481 
     | 
    
         
            +
                                *args,
         
     | 
| 
      
 482 
     | 
    
         
            +
                                trust_remote_code=trust_remote_code,
         
     | 
| 
      
 483 
     | 
    
         
            +
                                revision=revision,
         
     | 
| 
      
 484 
     | 
    
         
            +
                                **kwargs,
         
     | 
| 
      
 485 
     | 
    
         
            +
                            )
         
     | 
| 
      
 486 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 487 
     | 
    
         
            +
                            processor = AutoProcessor.from_pretrained(
         
     | 
| 
      
 488 
     | 
    
         
            +
                                tokenizer_name,
         
     | 
| 
      
 489 
     | 
    
         
            +
                                *args,
         
     | 
| 
      
 490 
     | 
    
         
            +
                                trust_remote_code=trust_remote_code,
         
     | 
| 
      
 491 
     | 
    
         
            +
                                revision=revision,
         
     | 
| 
      
 492 
     | 
    
         
            +
                                **kwargs,
         
     | 
| 
      
 493 
     | 
    
         
            +
                            )
         
     | 
| 
       468 
494 
     | 
    
         | 
| 
       469 
495 
     | 
    
         
             
                except ValueError as e:
         
     | 
| 
       470 
496 
     | 
    
         
             
                    error_message = str(e)
         
     |