sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
 - sglang/bench_one_batch.py +149 -34
 - sglang/bench_serving.py +73 -14
 - sglang/compile_deep_gemm.py +13 -7
 - sglang/launch_server.py +2 -0
 - sglang/srt/batch_invariant_ops/__init__.py +2 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
 - sglang/srt/checkpoint_engine/__init__.py +9 -0
 - sglang/srt/checkpoint_engine/update.py +317 -0
 - sglang/srt/compilation/backend.py +1 -1
 - sglang/srt/configs/__init__.py +2 -0
 - sglang/srt/configs/deepseek_ocr.py +542 -10
 - sglang/srt/configs/deepseekvl2.py +95 -194
 - sglang/srt/configs/kimi_linear.py +160 -0
 - sglang/srt/configs/mamba_utils.py +66 -0
 - sglang/srt/configs/model_config.py +30 -7
 - sglang/srt/constants.py +7 -0
 - sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
 - sglang/srt/disaggregation/decode.py +34 -6
 - sglang/srt/disaggregation/nixl/conn.py +2 -2
 - sglang/srt/disaggregation/prefill.py +25 -3
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
 - sglang/srt/distributed/parallel_state.py +9 -12
 - sglang/srt/entrypoints/engine.py +31 -20
 - sglang/srt/entrypoints/grpc_server.py +0 -1
 - sglang/srt/entrypoints/http_server.py +94 -94
 - sglang/srt/entrypoints/openai/protocol.py +7 -1
 - sglang/srt/entrypoints/openai/serving_chat.py +42 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +10 -0
 - sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
 - sglang/srt/environ.py +23 -2
 - sglang/srt/eplb/expert_distribution.py +64 -1
 - sglang/srt/eplb/expert_location.py +106 -36
 - sglang/srt/function_call/function_call_parser.py +2 -0
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/grpc/compile_proto.py +3 -0
 - sglang/srt/layers/activation.py +6 -0
 - sglang/srt/layers/attention/ascend_backend.py +233 -5
 - sglang/srt/layers/attention/attention_registry.py +3 -0
 - sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
 - sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
 - sglang/srt/layers/attention/fla/kda.py +1359 -0
 - sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
 - sglang/srt/layers/attention/flashattention_backend.py +19 -8
 - sglang/srt/layers/attention/flashinfer_backend.py +10 -1
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
 - sglang/srt/layers/attention/flashmla_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
 - sglang/srt/layers/attention/mamba/mamba.py +20 -11
 - sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
 - sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
 - sglang/srt/layers/attention/nsa/transform_index.py +1 -1
 - sglang/srt/layers/attention/nsa_backend.py +157 -23
 - sglang/srt/layers/attention/triton_backend.py +4 -1
 - sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
 - sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
 - sglang/srt/layers/attention/utils.py +78 -0
 - sglang/srt/layers/communicator.py +24 -1
 - sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
 - sglang/srt/layers/layernorm.py +35 -6
 - sglang/srt/layers/logits_processor.py +9 -20
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
 - sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
 - sglang/srt/layers/moe/ep_moe/layer.py +78 -289
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
 - sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
 - sglang/srt/layers/moe/topk.py +35 -10
 - sglang/srt/layers/moe/utils.py +3 -4
 - sglang/srt/layers/pooler.py +21 -2
 - sglang/srt/layers/quantization/__init__.py +13 -84
 - sglang/srt/layers/quantization/auto_round.py +394 -0
 - sglang/srt/layers/quantization/awq.py +0 -3
 - sglang/srt/layers/quantization/base_config.py +7 -0
 - sglang/srt/layers/quantization/fp8.py +68 -63
 - sglang/srt/layers/quantization/fp8_kernel.py +1 -1
 - sglang/srt/layers/quantization/fp8_utils.py +2 -2
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +168 -11
 - sglang/srt/layers/quantization/mxfp4.py +30 -38
 - sglang/srt/layers/quantization/unquant.py +23 -45
 - sglang/srt/layers/quantization/w4afp8.py +38 -2
 - sglang/srt/layers/radix_attention.py +5 -2
 - sglang/srt/layers/rotary_embedding.py +130 -46
 - sglang/srt/layers/sampler.py +12 -1
 - sglang/srt/lora/lora_registry.py +9 -0
 - sglang/srt/managers/async_mm_data_processor.py +122 -0
 - sglang/srt/managers/data_parallel_controller.py +30 -3
 - sglang/srt/managers/detokenizer_manager.py +3 -0
 - sglang/srt/managers/io_struct.py +29 -4
 - sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
 - sglang/srt/managers/schedule_batch.py +74 -15
 - sglang/srt/managers/scheduler.py +185 -144
 - sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
 - sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
 - sglang/srt/managers/scheduler_pp_mixin.py +7 -2
 - sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
 - sglang/srt/managers/session_controller.py +6 -5
 - sglang/srt/managers/tokenizer_manager.py +165 -78
 - sglang/srt/managers/tp_worker.py +24 -1
 - sglang/srt/mem_cache/base_prefix_cache.py +23 -4
 - sglang/srt/mem_cache/common.py +1 -0
 - sglang/srt/mem_cache/hicache_storage.py +7 -1
 - sglang/srt/mem_cache/memory_pool.py +253 -57
 - sglang/srt/mem_cache/memory_pool_host.py +12 -5
 - sglang/srt/mem_cache/radix_cache.py +4 -0
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
 - sglang/srt/metrics/collector.py +46 -3
 - sglang/srt/model_executor/cuda_graph_runner.py +15 -3
 - sglang/srt/model_executor/forward_batch_info.py +55 -14
 - sglang/srt/model_executor/model_runner.py +77 -170
 - sglang/srt/model_executor/npu_graph_runner.py +7 -3
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
 - sglang/srt/model_loader/weight_utils.py +1 -1
 - sglang/srt/models/bailing_moe.py +9 -2
 - sglang/srt/models/deepseek_nextn.py +11 -2
 - sglang/srt/models/deepseek_v2.py +296 -78
 - sglang/srt/models/glm4.py +391 -77
 - sglang/srt/models/glm4_moe.py +322 -354
 - sglang/srt/models/glm4_moe_nextn.py +4 -14
 - sglang/srt/models/glm4v.py +196 -55
 - sglang/srt/models/glm4v_moe.py +29 -197
 - sglang/srt/models/gpt_oss.py +1 -10
 - sglang/srt/models/kimi_linear.py +678 -0
 - sglang/srt/models/llama4.py +1 -1
 - sglang/srt/models/llama_eagle3.py +11 -1
 - sglang/srt/models/longcat_flash.py +2 -2
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/qwen2.py +23 -2
 - sglang/srt/models/qwen2_moe.py +30 -15
 - sglang/srt/models/qwen3.py +35 -5
 - sglang/srt/models/qwen3_moe.py +18 -12
 - sglang/srt/models/qwen3_next.py +7 -0
 - sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
 - sglang/srt/multimodal/processors/base_processor.py +1 -0
 - sglang/srt/multimodal/processors/glm4v.py +1 -1
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
 - sglang/srt/multiplex/multiplexing_mixin.py +209 -0
 - sglang/srt/multiplex/pdmux_context.py +164 -0
 - sglang/srt/parser/conversation.py +7 -1
 - sglang/srt/parser/reasoning_parser.py +28 -1
 - sglang/srt/sampling/custom_logit_processor.py +67 -1
 - sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
 - sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
 - sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
 - sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
 - sglang/srt/server_args.py +459 -199
 - sglang/srt/single_batch_overlap.py +2 -4
 - sglang/srt/speculative/draft_utils.py +16 -0
 - sglang/srt/speculative/eagle_info.py +42 -36
 - sglang/srt/speculative/eagle_info_v2.py +68 -25
 - sglang/srt/speculative/eagle_utils.py +261 -16
 - sglang/srt/speculative/eagle_worker.py +11 -3
 - sglang/srt/speculative/eagle_worker_v2.py +15 -9
 - sglang/srt/speculative/spec_info.py +305 -31
 - sglang/srt/speculative/spec_utils.py +44 -8
 - sglang/srt/tracing/trace.py +121 -12
 - sglang/srt/utils/common.py +142 -74
 - sglang/srt/utils/hf_transformers_utils.py +38 -12
 - sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
 - sglang/test/kits/radix_cache_server_kit.py +50 -0
 - sglang/test/runners.py +31 -7
 - sglang/test/simple_eval_common.py +5 -3
 - sglang/test/simple_eval_humaneval.py +1 -0
 - sglang/test/simple_eval_math.py +1 -0
 - sglang/test/simple_eval_mmlu.py +1 -0
 - sglang/test/simple_eval_mmmu_vlm.py +1 -0
 - sglang/test/test_deterministic.py +235 -12
 - sglang/test/test_deterministic_utils.py +2 -1
 - sglang/test/test_utils.py +7 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
 - sglang/srt/models/vila.py +0 -306
 - /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
 
    
sglang/srt/entrypoints/engine.py CHANGED
@@ -101,7 +101,7 @@ class Engine(EngineBase):
 
     Note:
     1. The HTTP server, Engine, and TokenizerManager all run in the main process.
-    2. Inter-process communication …
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
 
     def __init__(self, **kwargs):
@@ -109,6 +109,8 @@ class Engine(EngineBase):
         The arguments of this function is the same as `sglang/srt/server_args.py::ServerArgs`.
         Please refer to `ServerArgs` for the documentation.
         """
+
+        # Parse server_args
         if "server_args" in kwargs:
             # Directly load server_args
             server_args = kwargs["server_args"]
@@ -118,34 +120,36 @@ class Engine(EngineBase):
             # Do not print logs by default
             kwargs["log_level"] = "error"
         server_args = ServerArgs(**kwargs)
+        self.server_args = server_args
+        logger.info(f"{server_args=}")
 
         # Shutdown the subprocesses automatically when the program exits
         atexit.register(self.shutdown)
 
-        # Allocate ports for inter-process communications
-        self.port_args = PortArgs.init_new(server_args)
-        logger.info(f"{server_args=}")
-
         # Launch subprocesses
-        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-            server_args=server_args,
-            port_args=self.port_args,
+        tokenizer_manager, template_manager, scheduler_info, port_args = (
+            _launch_subprocesses(server_args=server_args)
         )
-        self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.template_manager = template_manager
         self.scheduler_info = scheduler_info
+        self.port_args = port_args
 
+        # Initialize ZMQ sockets
         context = zmq.Context(2)
         self.send_to_rpc = get_zmq_socket(
             context, zmq.DEALER, self.port_args.rpc_ipc_name, True
         )
 
+        # Enable tracing
         if server_args.enable_trace:
-            process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
-            if server_args.disaggregation_mode == "null":
-                thread_label = "Tokenizer"
-                trace_set_thread_info(thread_label)
+            process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+            thread_label = "Tokenizer"
+            if server_args.disaggregation_mode == "prefill":
+                thread_label = "Prefill Tokenizer"
+            elif server_args.disaggregation_mode == "decode":
+                thread_label = "Decode Tokenizer"
+            trace_set_thread_info(thread_label)
 
         try:
            self.loop = asyncio.get_running_loop()
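This refactor moves port allocation out of `Engine.__init__` and into `_launch_subprocesses`, which now hands `port_args` back to the caller. The public construction path is unchanged; a minimal usage sketch (the model path is a placeholder, and any `ServerArgs` field can be passed as a keyword, as the docstring in the hunk above notes):

```python
# Sketch of the unchanged public API: kwargs are forwarded to ServerArgs.
import sglang as sgl

if __name__ == "__main__":
    engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
    out = engine.generate("The capital of France is", {"max_new_tokens": 8})
    print(out)
    engine.shutdown()
```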
@@ -311,6 +315,7 @@ class Engine(EngineBase):
         image_data: Optional[MultimodalDataInputFormat] = None,
         audio_data: Optional[MultimodalDataInputFormat] = None,
         video_data: Optional[MultimodalDataInputFormat] = None,
+        dimensions: Optional[int] = None,
     ) -> Dict:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
@@ -321,6 +326,7 @@ class Engine(EngineBase):
             image_data=image_data,
             audio_data=audio_data,
             video_data=video_data,
+            dimensions=dimensions,
         )
         generator = self.tokenizer_manager.generate_request(obj, None)
         ret = self.loop.run_until_complete(generator.__anext__())
@@ -332,6 +338,7 @@ class Engine(EngineBase):
         image_data: Optional[MultimodalDataInputFormat] = None,
         audio_data: Optional[MultimodalDataInputFormat] = None,
         video_data: Optional[MultimodalDataInputFormat] = None,
+        dimensions: Optional[int] = None,
     ) -> Dict:
         """
         Asynchronous version of encode method.
@@ -344,6 +351,7 @@ class Engine(EngineBase):
             image_data=image_data,
             audio_data=audio_data,
             video_data=video_data,
+            dimensions=dimensions,
         )
         generator = self.tokenizer_manager.generate_request(obj, None)
         return await generator.__anext__()
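The four hunks above thread a new optional `dimensions` argument from `Engine.encode`/`Engine.async_encode` into `EmbeddingReqInput`, likely mirroring the small change to `sglang/srt/entrypoints/openai/protocol.py` listed above and presumably used to request truncated embedding sizes. A usage sketch, assuming an embedding model that supports dimension reduction (the model path is a placeholder):

```python
import sglang as sgl

engine = sgl.Engine(model_path="Qwen/Qwen3-Embedding-0.6B", is_embedding=True)
# New in 0.5.4.post2: ask for a reduced embedding dimension.
ret = engine.encode("hello world", dimensions=512)
print(len(ret["embedding"]))  # expected: 512, if the model supports truncation
engine.shutdown()
```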
@@ -669,18 +677,21 @@ class Engine(EngineBase):
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-    …
+    if "NCCL_CUMEM_ENABLE" not in os.environ:
+        os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
     if not server_args.enable_symm_mem:
         os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
-    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "…"
+    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
 
     if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        # flashinfer uses this environment variable for various kernels from MoE to quant kernels
         os.environ["TRTLLM_ENABLE_PDL"] = "1"
 
     if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
         # Default to warning level, to avoid too many logs
         os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
+
     if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
         # Need to set log to console, otherwise the log level won't take effect
         os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
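`NCCL_CUMEM_ENABLE` is now only given a default when the user has not already set it, so explicit environment overrides survive. The pattern, in isolation:

```python
# Standalone sketch of the set-if-absent pattern used above.
import os

def set_default_env(name: str, value: str) -> None:
    # equivalently: os.environ.setdefault(name, value)
    if name not in os.environ:
        os.environ[name] = value

set_default_env("NCCL_CUMEM_ENABLE", "0")  # "0" is an illustrative default
```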
@@ -709,7 +720,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.16.…",
+            "0.3.16.post4",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
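The required sgl-kernel build is bumped to 0.3.16.post4. A rough equivalent of a minimum-version assertion like `assert_pkg_version` (a sketch only; it assumes the `packaging` library is installed, and the real helper's logic may differ):

```python
from importlib.metadata import version
from packaging.version import Version

def check_min_version(pkg: str, minimum: str, hint: str) -> None:
    # Version() handles post-releases such as "0.3.16.post4" correctly.
    installed = Version(version(pkg))
    if installed < Version(minimum):
        raise RuntimeError(f"{pkg} {installed} is older than {minimum}. {hint}")

check_min_version("sgl-kernel", "0.3.16.post4",
                  "Please reinstall with `pip install sgl-kernel --force-reinstall`")
```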
@@ -840,7 +851,7 @@ def _launch_subprocesses(
 
         if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
             # When using `Engine` as a Python API, we don't want to block here.
-            return None, None, None
+            return None, None, None, port_args
 
         launch_dummy_health_check_server(
             server_args.host, server_args.port, server_args.enable_metrics
@@ -851,7 +862,7 @@ def _launch_subprocesses(
             logger.error(
                 f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
             )
-            return None, None, None
+            return None, None, None, port_args
 
     # Launch detokenizer process
     detoken_proc = mp.Process(
@@ -897,4 +908,4 @@ def _launch_subprocesses(
 
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
 
-    return tokenizer_manager, template_manager, scheduler_info
+    return tokenizer_manager, template_manager, scheduler_info, port_args
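`_launch_subprocesses` now returns a 4-tuple ending in `port_args` instead of a 3-tuple, so any external code unpacking its result must be updated or it will raise `ValueError: too many values to unpack (expected 3)`. A sketch of the updated contract (this is a private API, the call spawns the scheduler/detokenizer subprocesses, and the model path is a placeholder):

```python
from sglang.srt.entrypoints.engine import _launch_subprocesses
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")
# 0.5.4:        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(...)
# 0.5.4.post2:  port_args is returned as the fourth element.
tokenizer_manager, template_manager, scheduler_info, port_args = _launch_subprocesses(
    server_args=server_args
)
```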
sglang/srt/entrypoints/grpc_server.py CHANGED

@@ -999,7 +999,6 @@ def _wait_and_warmup_grpc(
     # Mark health service as SERVING after warmup completes
     if health_servicer:
         health_servicer.set_serving()
-        logger.info("Health service marked as SERVING")
 
     logger.info("The server is fired up and ready to roll!")
 
sglang/srt/entrypoints/http_server.py CHANGED

@@ -20,7 +20,7 @@ This file implements HTTP APIs for the inference engine via fastapi.
 import asyncio
 import dataclasses
 import logging
-import multiprocessing …
+import multiprocessing
 import os
 import tempfile
 import threading
@@ -165,6 +165,7 @@ async def init_multi_tokenizer() -> ServerArgs:
         server_args.api_key is None
     ), "API key is not supported in multi-tokenizer mode"
 
+    # Create a new ipc name for the current process
     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
     )
@@ -184,6 +185,7 @@ async def init_multi_tokenizer() -> ServerArgs:
     )
 
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -192,36 +194,38 @@ async def init_multi_tokenizer() -> ServerArgs:
         )
     )
 
-    if server_args.enable_trace:
-        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
-        if server_args.disaggregation_mode == "null":
-            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
-            trace_set_thread_info(thread_label)
-
     return server_args
 
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    if …
+    if getattr(fast_api_app, "is_single_tokenizer_mode", False):
+        server_args = fast_api_app.server_args
+        warmup_thread_args = fast_api_app.warmup_thread_args
+        thread_label = "Tokenizer"
+    else:
         # Initialize multi-tokenizer support for worker processes
-        …
-        add_prometheus_middleware(app)
-        enable_func_timer()
-
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-        fast_api_app.warmup_thread = threading.Thread(
-            target=_wait_and_warmup,
-            args=(
-                fast_api_app.server_args,
-                None,  # pipe_finish_writer not needed in worker
-                None,  # launch_callback not needed in worker
-            ),
+        server_args = await init_multi_tokenizer()
+        warmup_thread_args = (
+            server_args,
+            None,
+            None,
         )
+        thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}"
+
+    # Add prometheus middleware
+    if server_args.enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+
+    # Init tracing
+    if server_args.enable_trace:
+        process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "prefill":
+            thread_label = "Prefill" + thread_label
+        elif server_args.disaggregation_mode == "decode":
+            thread_label = "Decode" + thread_label
+        trace_set_thread_info(thread_label)
 
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
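In single-tokenizer mode the launch arguments now reach `lifespan` as plain attributes on the app object, set before uvicorn starts. A standalone sketch of that pattern (not sglang code; names and the placeholder config are illustrative):

```python
from contextlib import asynccontextmanager
from fastapi import FastAPI

@asynccontextmanager
async def lifespan(fast_api_app: FastAPI):
    # Read back whatever the launcher attached before uvicorn.run().
    if getattr(fast_api_app, "is_single_tokenizer_mode", False):
        print("config:", fast_api_app.server_args)
    yield  # the application serves requests while suspended here

app = FastAPI(lifespan=lifespan)
app.is_single_tokenizer_mode = True
app.server_args = {"log_level": "error"}  # placeholder config object
```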
@@ -249,8 +253,7 @@ async def lifespan(fast_api_app: FastAPI):
         _global_state.tokenizer_manager
     )
 
-    …
-    …
+    # Launch tool server
     tool_server = None
     if server_args.tool_server == "demo":
         from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
@@ -274,12 +277,11 @@ async def lifespan(fast_api_app: FastAPI):
             enable_force_include_usage=True,
             tool_server=tool_server,
         )
-    except Exception as e:
-        …
-
-        traceback.print_exc()
-        logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.warning(f"Can not initialize OpenAIServingResponses, error: {traceback}")
 
+    # Execute custom warmups
     if server_args.warmups is not None:
         await execute_warmups(
             server_args.disaggregation_mode,
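The failure path now logs a formatted traceback string instead of printing it. A stdlib sketch of the same effect (assuming `get_exception_traceback` returns something like `traceback.format_exc()`; the real sglang helper may differ):

```python
import logging
import traceback

logger = logging.getLogger(__name__)

try:
    raise ValueError("demo failure")
except Exception:
    tb = traceback.format_exc()  # the full traceback, as one string
    logger.warning(f"Can not initialize OpenAIServingResponses, error: {tb}")
```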
@@ -288,18 +290,18 @@ async def lifespan(fast_api_app: FastAPI):
     )
     logger.info("Warmup ended")
 
-    …
+    # Execute the general warmup
+    warmup_thread = threading.Thread(
+        target=_wait_and_warmup,
+        args=warmup_thread_args,
+    )
+    warmup_thread.start()
 
+    # Start the HTTP server
     try:
         yield
     finally:
-        …
-            pid = os.getpid()
-            logger.info(f"uvicorn worker {pid} ending...")
-            warmup_thread.join()
-            logger.info(f"uvicorn worker {pid} ended.")
+        warmup_thread.join()
 
 
 # Fast API
@@ -499,6 +501,11 @@ async def get_server_info():
     internal_states: List[Dict[Any, Any]] = (
         await _global_state.tokenizer_manager.get_internal_state()
     )
+
+    # This field is not serializable.
+    if hasattr(_global_state.tokenizer_manager.server_args, "model_config"):
+        del _global_state.tokenizer_manager.server_args.model_config
+
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
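`/get_server_info` serializes `server_args` with `dataclasses.asdict`, which deep-copies every field; a field holding an arbitrary object survives `asdict` but then breaks JSON encoding, which is why `model_config` is dropped first. A standalone illustration:

```python
import dataclasses
import json

class ModelConfig:  # stand-in for a complex runtime object
    pass

@dataclasses.dataclass
class Args:
    model_path: str
    model_config: object = None

d = dataclasses.asdict(Args("m", ModelConfig()))  # succeeds (deep copy)
# json.dumps(d)  # would raise TypeError: Object of type ModelConfig is not JSON serializable
```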
@@ -1164,6 +1171,8 @@ async def available_models():
     """Show available models. OpenAI-compatible endpoint."""
     served_model_names = [_global_state.tokenizer_manager.served_model_name]
     model_cards = []
+
+    # Add base model
     for served_model_name in served_model_names:
         model_cards.append(
             ModelCard(
@@ -1172,6 +1181,20 @@ async def available_models():
                 max_model_len=_global_state.tokenizer_manager.model_config.context_len,
             )
         )
+
+    # Add loaded LoRA adapters
+    if _global_state.tokenizer_manager.server_args.enable_lora:
+        lora_registry = _global_state.tokenizer_manager.lora_registry
+        for _, lora_ref in lora_registry.get_all_adapters().items():
+            model_cards.append(
+                ModelCard(
+                    id=lora_ref.lora_name,
+                    root=lora_ref.lora_path,
+                    parent=served_model_names[0],
+                    max_model_len=None,
+                )
+            )
+
     return ModelList(data=model_cards)
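With LoRA enabled, `/v1/models` now lists loaded adapters next to the base model, each carrying a `parent` that points at the base. A sketch of querying the endpoint (assumes a server listening on localhost:30000; stdlib only):

```python
import json
import urllib.request

with urllib.request.urlopen("http://localhost:30000/v1/models") as resp:
    models = json.load(resp)

for card in models["data"]:
    # LoRA adapters report the base model as their parent.
    print(card["id"], card.get("parent"))
```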
@@ -1328,27 +1351,12 @@ def launch_server(
         3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
 
     Note:
-    1. The HTTP server, Engine, and TokenizerManager …
+    1. The HTTP server, Engine, and TokenizerManager all run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-    …
-            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
-        )
-        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-            server_args=server_args, port_args=port_args
-        )
-    else:
-        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-            server_args=server_args,
-        )
-
-        if server_args.enable_trace:
-            process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
-            if server_args.disaggregation_mode == "null":
-                thread_label = "Tokenizer"
-                trace_set_thread_info(thread_label)
+    tokenizer_manager, template_manager, scheduler_info, port_args = (
+        _launch_subprocesses(server_args=server_args)
+    )
 
     set_global_state(
         _GlobalState(
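`launch_server` now takes the single `_launch_subprocesses` path regardless of tokenizer worker count, receiving `port_args` from it instead of constructing ports up front. Programmatic launch is otherwise unchanged; a sketch (the model path is a placeholder, and the call blocks and serves HTTP until shutdown):

```python
from sglang.srt.entrypoints.http_server import launch_server
from sglang.srt.server_args import ServerArgs

if __name__ == "__main__":
    server_args = ServerArgs(
        model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder
        port=30000,
    )
    launch_server(server_args)
```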
@@ -1358,40 +1366,45 @@ def launch_server(
         )
     )
 
-    …
+    # Pass additional arguments to the lifespan function.
+    # They will be used for additional initialization setups.
+    if server_args.tokenizer_worker_num == 1:
+        # If it is single tokenizer mode, we can pass the arguments by attributes of the app object.
+        app.is_single_tokenizer_mode = True
+        app.server_args = server_args
+        app.warmup_thread_args = (
             server_args,
-            …
+            pipe_finish_writer,
+            launch_callback,
         )
-    …
+
         # Add api key authorization
+        # This is only supported in single tokenizer mode.
         if server_args.api_key:
             add_api_key_middleware(app, server_args.api_key)
-    …
-        # Send a warmup request - we will create the thread launch it
-        # in the lifespan after all other warmups have fired.
-        warmup_thread = threading.Thread(
-            target=_wait_and_warmup,
-            args=(
-                server_args,
-                pipe_finish_writer,
-                launch_callback,
-            ),
+    else:
+        # If it is multi-tokenizer mode, we need to write the arguments to shared memory
+        # for other worker processes to read.
+        app.is_single_tokenizer_mode = False
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args, server_args, scheduler_info
         )
-        app.warmup_thread = warmup_thread
 
     try:
         # Update logging configs
         set_uvicorn_logging_configs()
 
         # Listen for HTTP requests
-        if server_args.tokenizer_worker_num …
+        if server_args.tokenizer_worker_num == 1:
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
…
    
         
            +
                            timeout_keep_alive=5,
         
     | 
| 
      
 1405 
     | 
    
         
            +
                            loop="uvloop",
         
     | 
| 
      
 1406 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1407 
     | 
    
         
            +
                    else:
         
     | 
| 
       1395 
1408 
     | 
    
         
             
                        from uvicorn.config import LOGGING_CONFIG
         
     | 
| 
       1396 
1409 
     | 
    
         | 
| 
       1397 
1410 
     | 
    
         
             
                        LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
         
     | 
| 
         @@ -1399,7 +1412,6 @@ def launch_server( 
     | 
|
| 
       1399 
1412 
     | 
    
         
             
                            "level": "INFO",
         
     | 
| 
       1400 
1413 
     | 
    
         
             
                            "propagate": False,
         
     | 
| 
       1401 
1414 
     | 
    
         
             
                        }
         
     | 
| 
       1402 
     | 
    
         
            -
             
     | 
| 
       1403 
1415 
     | 
    
         
             
                        monkey_patch_uvicorn_multiprocessing()
         
     | 
| 
       1404 
1416 
     | 
    
         | 
| 
       1405 
1417 
     | 
    
         
             
                        uvicorn.run(
         
     | 
| 
         @@ -1411,22 +1423,10 @@ def launch_server( 
     | 
|
| 
       1411 
1423 
     | 
    
         
             
                            loop="uvloop",
         
     | 
| 
       1412 
1424 
     | 
    
         
             
                            workers=server_args.tokenizer_worker_num,
         
     | 
| 
       1413 
1425 
     | 
    
         
             
                        )
         
     | 
| 
       1414 
     | 
    
         
            -
                    else:
         
     | 
| 
       1415 
     | 
    
         
            -
                        app.is_single_tokenizer_mode = True
         
     | 
| 
       1416 
     | 
    
         
            -
                        uvicorn.run(
         
     | 
| 
       1417 
     | 
    
         
            -
                            app,
         
     | 
| 
       1418 
     | 
    
         
            -
                            host=server_args.host,
         
     | 
| 
       1419 
     | 
    
         
            -
                            port=server_args.port,
         
     | 
| 
       1420 
     | 
    
         
            -
                            log_level=server_args.log_level_http or server_args.log_level,
         
     | 
| 
       1421 
     | 
    
         
            -
                            timeout_keep_alive=5,
         
     | 
| 
       1422 
     | 
    
         
            -
                            loop="uvloop",
         
     | 
| 
       1423 
     | 
    
         
            -
                        )
         
     | 
| 
       1424 
1426 
     | 
    
         
             
                finally:
         
     | 
| 
       1425 
1427 
     | 
    
         
             
                    if server_args.tokenizer_worker_num > 1:
         
     | 
| 
       1426 
1428 
     | 
    
         
             
                        multi_tokenizer_args_shm.unlink()
         
     | 
| 
       1427 
1429 
     | 
    
         
             
                        _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         
     | 
| 
       1428 
     | 
    
         
            -
                    else:
         
     | 
| 
       1429 
     | 
    
         
            -
                        warmup_thread.join()
         
     | 
| 
       1430 
1430 
     | 
    
         | 
| 
       1431 
1431 
     | 
    
         | 
| 
       1432 
1432 
     | 
    
         
             
            def _execute_server_warmup(
         
     | 
| 
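Note: the restructured launch path above hinges on how uvicorn handles workers. With `workers=N`, each worker is a fresh process that re-imports the app, so attributes the launcher set on the app object are lost; that is why multi-tokenizer mode serializes its arguments to shared memory instead. A minimal sketch of the single-worker side, assuming a FastAPI lifespan hook (the attribute names mirror the diff; the hook body is illustrative, not sglang's actual lifespan):

from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Single-worker mode: the launcher called uvicorn.run(app) in-process,
    # so attributes it attached to `app` are still visible here.
    if getattr(app, "is_single_tokenizer_mode", False):
        server_args, pipe_finish_writer, launch_callback = app.warmup_thread_args
        # ...start the warmup thread with these arguments...
    else:
        # Multi-worker mode: this process was re-spawned by uvicorn, so the
        # launcher's attributes are gone; the arguments must instead be read
        # from the shared memory written by write_data_for_multi_tokenizer.
        ...
    yield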
sglang/srt/entrypoints/openai/protocol.py
CHANGED

@@ -37,7 +37,11 @@ from pydantic import (
     model_validator,
 )
 from typing_extensions import Literal
-
+
+try:
+    from xgrammar import StructuralTag
+except:
+    StructuralTag = Any
 
 from sglang.utils import convert_json_schema_to_str
 
@@ -54,6 +58,7 @@ class ModelCard(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = "sglang"
     root: Optional[str] = None
+    parent: Optional[str] = None
     max_model_len: Optional[int] = None
 
 
@@ -108,6 +113,7 @@ class UsageInfo(BaseModel):
 
 class StreamOptions(BaseModel):
     include_usage: Optional[bool] = False
+    continuous_usage_stats: Optional[bool] = False
 
 
 class JsonSchemaResponseFormat(BaseModel):
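The new import block is the usual optional-dependency guard: if xgrammar is absent, `StructuralTag` degrades to `Any` so annotations that reference it still resolve. A standalone sketch of the same pattern (narrowed to `ImportError` rather than the bare `except:` used above):

from typing import Any

try:
    from xgrammar import StructuralTag
except ImportError:
    # Fallback keeps type annotations importable when xgrammar is missing.
    StructuralTag = Any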
sglang/srt/entrypoints/openai/serving_chat.py
CHANGED

@@ -535,6 +535,17 @@ class OpenAIServingChat(OpenAIServingBase):
                         choices=[choice_data],
                         model=request.model,
                     )
+
+                    # Add usage stats if continuous_usage_stats is enabled
+                    if (
+                        request.stream_options
+                        and request.stream_options.continuous_usage_stats
+                    ):
+                        chunk.usage = UsageProcessor.calculate_token_usage(
+                            prompt_tokens=prompt_tokens.get(index, 0),
+                            completion_tokens=completion_tokens.get(index, 0),
+                        )
+
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
             # Handle tool calls
@@ -579,6 +590,17 @@ class OpenAIServingChat(OpenAIServingBase):
                         choices=[choice_data],
                         model=request.model,
                     )
+
+                    # Add usage stats if continuous_usage_stats is enabled
+                    if (
+                        request.stream_options
+                        and request.stream_options.continuous_usage_stats
+                    ):
+                        chunk.usage = UsageProcessor.calculate_token_usage(
+                            prompt_tokens=prompt_tokens.get(index, 0),
+                            completion_tokens=completion_tokens.get(index, 0),
+                        )
+
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
             # Send finish_reason chunks for each index that completed
@@ -1056,6 +1078,16 @@ class OpenAIServingChat(OpenAIServingBase):
             choices=[choice_data],
             model=request.model,
         )
+
+        # Add usage stats if continuous_usage_stats is enabled
+        if request.stream_options and request.stream_options.continuous_usage_stats:
+            prompt_tokens = content["meta_info"].get("prompt_tokens", 0)
+            completion_tokens = content["meta_info"].get("completion_tokens", 0)
+            chunk.usage = UsageProcessor.calculate_token_usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )
+
         yield f"data: {chunk.model_dump_json()}\n\n"
 
         # Yield tool calls
@@ -1096,6 +1128,16 @@ class OpenAIServingChat(OpenAIServingBase):
             choices=[choice_data],
             model=request.model,
         )
+
+        # Add usage stats if continuous_usage_stats is enabled
+        if request.stream_options and request.stream_options.continuous_usage_stats:
+            prompt_tokens = content["meta_info"].get("prompt_tokens", 0)
+            completion_tokens = content["meta_info"].get("completion_tokens", 0)
+            chunk.usage = UsageProcessor.calculate_token_usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )
+
         yield f"data: {chunk.model_dump_json()}\n\n"
 
     def _check_for_unstreamed_tool_args(
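The four insertions above all do the same thing: when a client sets `stream_options.continuous_usage_stats`, every streamed chunk carries a usage snapshot rather than only the final one. A hedged client-side sketch against a running server (base URL and model name are assumptions for illustration):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="qwen/qwen2.5-7b-instruct",  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    # continuous_usage_stats is an sglang extension of stream_options.
    stream_options={"include_usage": True, "continuous_usage_stats": True},
)
for chunk in stream:
    if chunk.usage is not None:
        # With continuous_usage_stats, usage arrives on every chunk.
        print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)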
sglang/srt/entrypoints/openai/serving_completions.py
CHANGED

@@ -272,6 +272,16 @@ class OpenAIServingCompletion(OpenAIServingBase):
                         model=request.model,
                     )
 
+                    # Add usage stats if continuous_usage_stats is enabled
+                    if (
+                        request.stream_options
+                        and request.stream_options.continuous_usage_stats
+                    ):
+                        chunk.usage = UsageProcessor.calculate_token_usage(
+                            prompt_tokens=prompt_tokens.get(index, 0),
+                            completion_tokens=completion_tokens.get(index, 0),
+                        )
+
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
                 if request.return_hidden_states and hidden_states:
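The completions endpoint gets the identical guard. For variety, a raw-HTTP sketch of the same option (URL and model name are again assumptions):

import json

import requests

resp = requests.post(
    "http://localhost:30000/v1/completions",  # placeholder URL
    json={
        "model": "qwen/qwen2.5-7b-instruct",  # placeholder model name
        "prompt": "Once upon a time",
        "max_tokens": 32,
        "stream": True,
        "stream_options": {"continuous_usage_stats": True},
    },
    stream=True,
)
for line in resp.iter_lines():
    # Server-sent events: each payload line is prefixed with "data: ".
    if line.startswith(b"data: ") and line != b"data: [DONE]":
        chunk = json.loads(line[len(b"data: "):])
        print(chunk.get("usage"))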
    
        sglang/srt/environ.py
    CHANGED
    
@@ -111,25 +111,31 @@ class Envs:
     # Model & File Download
     SGLANG_USE_MODELSCOPE = EnvBool(False)
 
+    # Logging Options
+    SGLANG_LOG_GC = EnvBool(False)
+    SGLANG_LOG_FORWARD_ITERS = EnvBool(False)
+    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
+
     # Test & Debug
     SGLANG_IS_IN_CI = EnvBool(False)
     SGLANG_IS_IN_CI_AMD = EnvBool(False)
     SGLANG_SET_CPU_AFFINITY = EnvBool(False)
     SGLANG_PROFILE_WITH_STACK = EnvBool(True)
     SGLANG_RECORD_STEP_TIME = EnvBool(False)
-    SGLANG_GC_LOG = EnvBool(False)
     SGLANG_FORCE_SHUTDOWN = EnvBool(False)
     SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
     SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
-    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
     SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
     SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
     SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
+    SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS = EnvInt(500)
+    SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE = EnvInt(64)
 
     # Scheduler: memory leak test
     SGLANG_TEST_RETRACT = EnvBool(False)
     SGLANG_TEST_RETRACT_INTERVAL = EnvInt(3)
+    SGLANG_TEST_RETRACT_NO_PREFILL_BS = EnvInt(2**31)
     SGLANG_ENABLE_RUNTIME_MEM_LEAK_CHECK = EnvBool(False)
 
     # Scheduler: new token ratio hyperparameters
@@ -177,6 +183,7 @@ class Envs:
 
     # Triton
     SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
+    SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE = EnvBool(False)
 
     # Torch Compile
     SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)
@@ -228,12 +235,16 @@ class Envs:
     SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
 
     # Overlap Spec V2
+    SGLANG_ENABLE_SPEC_V2 = EnvBool(False)
     SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)
 
     # VLM
     SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
     SGLANG_RESIZE_RESAMPLE = EnvStr("")
 
+    # Release & Resume Memory
+    SGLANG_MEMORY_SAVER_CUDA_GRAPH = EnvBool(False)
+
     # Ktransformers
     SGLANG_KT_MOE_NUM_GPU_EXPERTS = EnvInt(None)
     SGLANG_KT_MOE_CPUINFER = EnvInt(None)
@@ -251,7 +262,17 @@ class Envs:
 envs = Envs()
 
 
+def _print_deprecated_env(new_name: str, old_name: str):
+    if old_name in os.environ:
+        warnings.warn(
+            f"Environment variable {old_name} will be deprecated, please use {new_name} instead"
+        )
+        os.environ[new_name] = os.environ[old_name]
+
+
 def _convert_SGL_to_SGLANG():
+    _print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
+
     for key, value in os.environ.items():
         if key.startswith("SGL_"):
             new_key = key.replace("SGL_", "SGLANG_", 1)
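The `SGLANG_GC_LOG` → `SGLANG_LOG_GC` rename is kept backward compatible by `_print_deprecated_env`, which mirrors the old variable onto the new name and warns. A self-contained sketch of its observable behavior:

import os
import warnings


def _print_deprecated_env(new_name: str, old_name: str):
    if old_name in os.environ:
        warnings.warn(
            f"Environment variable {old_name} will be deprecated, "
            f"please use {new_name} instead"
        )
        os.environ[new_name] = os.environ[old_name]


os.environ["SGLANG_GC_LOG"] = "1"  # legacy name still set by a user
_print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
assert os.environ["SGLANG_LOG_GC"] == "1"  # value mirrored to the new name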