sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +56 -12
 - sglang/launch_server.py +2 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
 - sglang/srt/compilation/backend.py +1 -1
 - sglang/srt/configs/model_config.py +5 -5
 - sglang/srt/distributed/parallel_state.py +0 -7
 - sglang/srt/entrypoints/engine.py +18 -15
 - sglang/srt/entrypoints/grpc_server.py +0 -1
 - sglang/srt/entrypoints/http_server.py +75 -94
 - sglang/srt/environ.py +16 -2
 - sglang/srt/eplb/expert_distribution.py +30 -0
 - sglang/srt/function_call/function_call_parser.py +2 -0
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/layers/activation.py +6 -0
 - sglang/srt/layers/attention/flashattention_backend.py +12 -2
 - sglang/srt/layers/attention/flashinfer_backend.py +10 -1
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
 - sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
 - sglang/srt/layers/attention/utils.py +78 -0
 - sglang/srt/layers/communicator.py +1 -0
 - sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
 - sglang/srt/layers/layernorm.py +19 -4
 - sglang/srt/layers/logits_processor.py +5 -0
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
 - sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
 - sglang/srt/layers/moe/ep_moe/layer.py +79 -272
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
 - sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
 - sglang/srt/layers/moe/topk.py +4 -4
 - sglang/srt/layers/moe/utils.py +3 -4
 - sglang/srt/layers/quantization/__init__.py +3 -5
 - sglang/srt/layers/quantization/awq.py +0 -3
 - sglang/srt/layers/quantization/base_config.py +7 -0
 - sglang/srt/layers/quantization/fp8.py +68 -63
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/mxfp4.py +30 -38
 - sglang/srt/layers/quantization/unquant.py +23 -45
 - sglang/srt/layers/quantization/w4afp8.py +38 -2
 - sglang/srt/layers/radix_attention.py +5 -2
 - sglang/srt/layers/rotary_embedding.py +13 -1
 - sglang/srt/layers/sampler.py +12 -1
 - sglang/srt/managers/io_struct.py +3 -0
 - sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
 - sglang/srt/managers/scheduler.py +21 -15
 - sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
 - sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
 - sglang/srt/managers/tokenizer_manager.py +11 -19
 - sglang/srt/mem_cache/hicache_storage.py +7 -1
 - sglang/srt/mem_cache/memory_pool.py +82 -0
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/model_executor/forward_batch_info.py +44 -3
 - sglang/srt/model_executor/model_runner.py +1 -149
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
 - sglang/srt/models/deepseek_v2.py +147 -44
 - sglang/srt/models/glm4_moe.py +322 -354
 - sglang/srt/models/glm4_moe_nextn.py +4 -14
 - sglang/srt/models/glm4v_moe.py +29 -196
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/qwen2.py +22 -1
 - sglang/srt/models/qwen3.py +34 -4
 - sglang/srt/models/qwen3_moe.py +2 -4
 - sglang/srt/multimodal/processors/base_processor.py +1 -0
 - sglang/srt/multimodal/processors/glm4v.py +1 -1
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
 - sglang/srt/parser/reasoning_parser.py +28 -1
 - sglang/srt/server_args.py +365 -186
 - sglang/srt/single_batch_overlap.py +2 -7
 - sglang/srt/utils/common.py +87 -42
 - sglang/srt/utils/hf_transformers_utils.py +7 -3
 - sglang/test/test_deterministic.py +235 -12
 - sglang/test/test_deterministic_utils.py +2 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
 - sglang/srt/models/vila.py +0 -306
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
 
| 
         @@ -20,7 +20,7 @@ This file implements HTTP APIs for the inference engine via fastapi. 
     | 
|
| 
       20 
20 
     | 
    
         
             
            import asyncio
         
     | 
| 
       21 
21 
     | 
    
         
             
            import dataclasses
         
     | 
| 
       22 
22 
     | 
    
         
             
            import logging
         
     | 
| 
       23 
     | 
    
         
            -
            import multiprocessing 
     | 
| 
      
 23 
     | 
    
         
            +
            import multiprocessing
         
     | 
| 
       24 
24 
     | 
    
         
             
            import os
         
     | 
| 
       25 
25 
     | 
    
         
             
            import tempfile
         
     | 
| 
       26 
26 
     | 
    
         
             
            import threading
         
     | 
| 
         @@ -165,6 +165,7 @@ async def init_multi_tokenizer() -> ServerArgs: 
     | 
|
| 
       165 
165 
     | 
    
         
             
                    server_args.api_key is None
         
     | 
| 
       166 
166 
     | 
    
         
             
                ), "API key is not supported in multi-tokenizer mode"
         
     | 
| 
       167 
167 
     | 
    
         | 
| 
      
 168 
     | 
    
         
            +
                # Create a new ipc name for the current process
         
     | 
| 
       168 
169 
     | 
    
         
             
                port_args.tokenizer_ipc_name = (
         
     | 
| 
       169 
170 
     | 
    
         
             
                    f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
         
     | 
| 
       170 
171 
     | 
    
         
             
                )
         
     | 
| 
         @@ -184,6 +185,7 @@ async def init_multi_tokenizer() -> ServerArgs: 
     | 
|
| 
       184 
185 
     | 
    
         
             
                )
         
     | 
| 
       185 
186 
     | 
    
         | 
| 
       186 
187 
     | 
    
         
             
                tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
         
     | 
| 
      
 188 
     | 
    
         
            +
             
     | 
| 
       187 
189 
     | 
    
         
             
                set_global_state(
         
     | 
| 
       188 
190 
     | 
    
         
             
                    _GlobalState(
         
     | 
| 
       189 
191 
     | 
    
         
             
                        tokenizer_manager=tokenizer_manager,
         
     | 
| 
         @@ -192,36 +194,35 @@ async def init_multi_tokenizer() -> ServerArgs: 
     | 
|
| 
       192 
194 
     | 
    
         
             
                    )
         
     | 
| 
       193 
195 
     | 
    
         
             
                )
         
     | 
| 
       194 
196 
     | 
    
         | 
| 
       195 
     | 
    
         
            -
                if server_args.enable_trace:
         
     | 
| 
       196 
     | 
    
         
            -
                    process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
         
     | 
| 
       197 
     | 
    
         
            -
                    if server_args.disaggregation_mode == "null":
         
     | 
| 
       198 
     | 
    
         
            -
                        thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
         
     | 
| 
       199 
     | 
    
         
            -
                        trace_set_thread_info(thread_label)
         
     | 
| 
       200 
     | 
    
         
            -
             
     | 
| 
       201 
197 
     | 
    
         
             
                return server_args
         
     | 
| 
       202 
198 
     | 
    
         | 
| 
       203 
199 
     | 
    
         | 
| 
       204 
200 
     | 
    
         
             
            @asynccontextmanager
         
     | 
| 
       205 
201 
     | 
    
         
             
            async def lifespan(fast_api_app: FastAPI):
         
     | 
| 
       206 
     | 
    
         
            -
                if  
     | 
| 
      
 202 
     | 
    
         
            +
                if getattr(fast_api_app, "is_single_tokenizer_mode", False):
         
     | 
| 
      
 203 
     | 
    
         
            +
                    server_args = fast_api_app.server_args
         
     | 
| 
      
 204 
     | 
    
         
            +
                    warmup_thread_args = fast_api_app.warmup_thread_args
         
     | 
| 
      
 205 
     | 
    
         
            +
                    thread_label = "Tokenizer"
         
     | 
| 
      
 206 
     | 
    
         
            +
                else:
         
     | 
| 
       207 
207 
     | 
    
         
             
                    # Initialize multi-tokenizer support for worker processes
         
     | 
| 
       208 
     | 
    
         
            -
                     
     | 
| 
       209 
     | 
    
         
            -
             
     | 
| 
       210 
     | 
    
         
            -
             
     | 
| 
       211 
     | 
    
         
            -
             
     | 
| 
       212 
     | 
    
         
            -
             
     | 
| 
       213 
     | 
    
         
            -
                        add_prometheus_middleware(app)
         
     | 
| 
       214 
     | 
    
         
            -
                        enable_func_timer()
         
     | 
| 
       215 
     | 
    
         
            -
             
     | 
| 
       216 
     | 
    
         
            -
                    logger.info(f"Worker {worker_pid} added prometheus middleware")
         
     | 
| 
       217 
     | 
    
         
            -
                    fast_api_app.warmup_thread = threading.Thread(
         
     | 
| 
       218 
     | 
    
         
            -
                        target=_wait_and_warmup,
         
     | 
| 
       219 
     | 
    
         
            -
                        args=(
         
     | 
| 
       220 
     | 
    
         
            -
                            fast_api_app.server_args,
         
     | 
| 
       221 
     | 
    
         
            -
                            None,  # pipe_finish_writer not needed in worker
         
     | 
| 
       222 
     | 
    
         
            -
                            None,  # launch_callback not needed in worker
         
     | 
| 
       223 
     | 
    
         
            -
                        ),
         
     | 
| 
      
 208 
     | 
    
         
            +
                    server_args = await init_multi_tokenizer()
         
     | 
| 
      
 209 
     | 
    
         
            +
                    warmup_thread_args = (
         
     | 
| 
      
 210 
     | 
    
         
            +
                        server_args,
         
     | 
| 
      
 211 
     | 
    
         
            +
                        None,
         
     | 
| 
      
 212 
     | 
    
         
            +
                        None,
         
     | 
| 
       224 
213 
     | 
    
         
             
                    )
         
     | 
| 
      
 214 
     | 
    
         
            +
                    thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}"
         
     | 
| 
      
 215 
     | 
    
         
            +
             
     | 
| 
      
 216 
     | 
    
         
            +
                # Add prometheus middleware
         
     | 
| 
      
 217 
     | 
    
         
            +
                if server_args.enable_metrics:
         
     | 
| 
      
 218 
     | 
    
         
            +
                    add_prometheus_middleware(app)
         
     | 
| 
      
 219 
     | 
    
         
            +
                    enable_func_timer()
         
     | 
| 
      
 220 
     | 
    
         
            +
             
     | 
| 
      
 221 
     | 
    
         
            +
                # Init tracing
         
     | 
| 
      
 222 
     | 
    
         
            +
                if server_args.enable_trace:
         
     | 
| 
      
 223 
     | 
    
         
            +
                    process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
         
     | 
| 
      
 224 
     | 
    
         
            +
                    if server_args.disaggregation_mode == "null":
         
     | 
| 
      
 225 
     | 
    
         
            +
                        trace_set_thread_info(thread_label)
         
     | 
| 
       225 
226 
     | 
    
         | 
| 
       226 
227 
     | 
    
         
             
                # Initialize OpenAI serving handlers
         
     | 
| 
       227 
228 
     | 
    
         
             
                fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         
     | 
| 
         @@ -249,8 +250,7 @@ async def lifespan(fast_api_app: FastAPI): 
     | 
|
| 
       249 
250 
     | 
    
         
             
                    _global_state.tokenizer_manager
         
     | 
| 
       250 
251 
     | 
    
         
             
                )
         
     | 
| 
       251 
252 
     | 
    
         | 
| 
       252 
     | 
    
         
            -
                 
     | 
| 
       253 
     | 
    
         
            -
             
     | 
| 
      
 253 
     | 
    
         
            +
                # Launch tool server
         
     | 
| 
       254 
254 
     | 
    
         
             
                tool_server = None
         
     | 
| 
       255 
255 
     | 
    
         
             
                if server_args.tool_server == "demo":
         
     | 
| 
       256 
256 
     | 
    
         
             
                    from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
         
     | 
| 
         @@ -274,12 +274,11 @@ async def lifespan(fast_api_app: FastAPI): 
     | 
|
| 
       274 
274 
     | 
    
         
             
                        enable_force_include_usage=True,
         
     | 
| 
       275 
275 
     | 
    
         
             
                        tool_server=tool_server,
         
     | 
| 
       276 
276 
     | 
    
         
             
                    )
         
     | 
| 
       277 
     | 
    
         
            -
                except Exception 
     | 
| 
       278 
     | 
    
         
            -
                     
     | 
| 
       279 
     | 
    
         
            -
             
     | 
| 
       280 
     | 
    
         
            -
                    traceback.print_exc()
         
     | 
| 
       281 
     | 
    
         
            -
                    logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
         
     | 
| 
      
 277 
     | 
    
         
            +
                except Exception:
         
     | 
| 
      
 278 
     | 
    
         
            +
                    traceback = get_exception_traceback()
         
     | 
| 
      
 279 
     | 
    
         
            +
                    logger.warning(f"Can not initialize OpenAIServingResponses, error: {traceback}")
         
     | 
| 
       282 
280 
     | 
    
         | 
| 
      
 281 
     | 
    
         
            +
                # Execute custom warmups
         
     | 
| 
       283 
282 
     | 
    
         
             
                if server_args.warmups is not None:
         
     | 
| 
       284 
283 
     | 
    
         
             
                    await execute_warmups(
         
     | 
| 
       285 
284 
     | 
    
         
             
                        server_args.disaggregation_mode,
         
     | 
| 
         @@ -288,18 +287,18 @@ async def lifespan(fast_api_app: FastAPI): 
     | 
|
| 
       288 
287 
     | 
    
         
             
                    )
         
     | 
| 
       289 
288 
     | 
    
         
             
                    logger.info("Warmup ended")
         
     | 
| 
       290 
289 
     | 
    
         | 
| 
       291 
     | 
    
         
            -
                 
     | 
| 
       292 
     | 
    
         
            -
                 
     | 
| 
       293 
     | 
    
         
            -
                     
     | 
| 
      
 290 
     | 
    
         
            +
                # Execute the general warmup
         
     | 
| 
      
 291 
     | 
    
         
            +
                warmup_thread = threading.Thread(
         
     | 
| 
      
 292 
     | 
    
         
            +
                    target=_wait_and_warmup,
         
     | 
| 
      
 293 
     | 
    
         
            +
                    args=warmup_thread_args,
         
     | 
| 
      
 294 
     | 
    
         
            +
                )
         
     | 
| 
      
 295 
     | 
    
         
            +
                warmup_thread.start()
         
     | 
| 
       294 
296 
     | 
    
         | 
| 
      
 297 
     | 
    
         
            +
                # Start the HTTP server
         
     | 
| 
       295 
298 
     | 
    
         
             
                try:
         
     | 
| 
       296 
299 
     | 
    
         
             
                    yield
         
     | 
| 
       297 
300 
     | 
    
         
             
                finally:
         
     | 
| 
       298 
     | 
    
         
            -
                     
     | 
| 
       299 
     | 
    
         
            -
                        pid = os.getpid()
         
     | 
| 
       300 
     | 
    
         
            -
                        logger.info(f"uvicorn worker {pid} ending...")
         
     | 
| 
       301 
     | 
    
         
            -
                        warmup_thread.join()
         
     | 
| 
       302 
     | 
    
         
            -
                        logger.info(f"uvicorn worker {pid} ended.")
         
     | 
| 
      
 301 
     | 
    
         
            +
                    warmup_thread.join()
         
     | 
| 
       303 
302 
     | 
    
         | 
| 
       304 
303 
     | 
    
         | 
| 
       305 
304 
     | 
    
         
             
            # Fast API
         
     | 
| 
         @@ -499,6 +498,11 @@ async def get_server_info(): 
     | 
|
| 
       499 
498 
     | 
    
         
             
                internal_states: List[Dict[Any, Any]] = (
         
     | 
| 
       500 
499 
     | 
    
         
             
                    await _global_state.tokenizer_manager.get_internal_state()
         
     | 
| 
       501 
500 
     | 
    
         
             
                )
         
     | 
| 
      
 501 
     | 
    
         
            +
             
     | 
| 
      
 502 
     | 
    
         
            +
                # This field is not serializable.
         
     | 
| 
      
 503 
     | 
    
         
            +
                if hasattr(_global_state.tokenizer_manager.server_args, "model_config"):
         
     | 
| 
      
 504 
     | 
    
         
            +
                    del _global_state.tokenizer_manager.server_args.model_config
         
     | 
| 
      
 505 
     | 
    
         
            +
             
     | 
| 
       502 
506 
     | 
    
         
             
                return {
         
     | 
| 
       503 
507 
     | 
    
         
             
                    **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         
     | 
| 
       504 
508 
     | 
    
         
             
                    **_global_state.scheduler_info,
         
     | 
| 
         @@ -1328,27 +1332,12 @@ def launch_server( 
     | 
|
| 
       1328 
1332 
     | 
    
         
             
                    3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
         
     | 
| 
       1329 
1333 
     | 
    
         | 
| 
       1330 
1334 
     | 
    
         
             
                Note:
         
     | 
| 
       1331 
     | 
    
         
            -
                1. The HTTP server, Engine, and TokenizerManager  
     | 
| 
      
 1335 
     | 
    
         
            +
                1. The HTTP server, Engine, and TokenizerManager all run in the main process.
         
     | 
| 
       1332 
1336 
     | 
    
         
             
                2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
         
     | 
| 
       1333 
1337 
     | 
    
         
             
                """
         
     | 
| 
       1334 
     | 
    
         
            -
                 
     | 
| 
       1335 
     | 
    
         
            -
                     
     | 
| 
       1336 
     | 
    
         
            -
             
     | 
| 
       1337 
     | 
    
         
            -
                        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
         
     | 
| 
       1338 
     | 
    
         
            -
                    )
         
     | 
| 
       1339 
     | 
    
         
            -
                    tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
         
     | 
| 
       1340 
     | 
    
         
            -
                        server_args=server_args, port_args=port_args
         
     | 
| 
       1341 
     | 
    
         
            -
                    )
         
     | 
| 
       1342 
     | 
    
         
            -
                else:
         
     | 
| 
       1343 
     | 
    
         
            -
                    tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
         
     | 
| 
       1344 
     | 
    
         
            -
                        server_args=server_args,
         
     | 
| 
       1345 
     | 
    
         
            -
                    )
         
     | 
| 
       1346 
     | 
    
         
            -
             
     | 
| 
       1347 
     | 
    
         
            -
                    if server_args.enable_trace:
         
     | 
| 
       1348 
     | 
    
         
            -
                        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
         
     | 
| 
       1349 
     | 
    
         
            -
                        if server_args.disaggregation_mode == "null":
         
     | 
| 
       1350 
     | 
    
         
            -
                            thread_label = "Tokenizer"
         
     | 
| 
       1351 
     | 
    
         
            -
                            trace_set_thread_info(thread_label)
         
     | 
| 
      
 1338 
     | 
    
         
            +
                tokenizer_manager, template_manager, scheduler_info, port_args = (
         
     | 
| 
      
 1339 
     | 
    
         
            +
                    _launch_subprocesses(server_args=server_args)
         
     | 
| 
      
 1340 
     | 
    
         
            +
                )
         
     | 
| 
       1352 
1341 
     | 
    
         | 
| 
       1353 
1342 
     | 
    
         
             
                set_global_state(
         
     | 
| 
       1354 
1343 
     | 
    
         
             
                    _GlobalState(
         
     | 
| 
         @@ -1358,40 +1347,45 @@ def launch_server( 
     | 
|
| 
       1358 
1347 
     | 
    
         
             
                    )
         
     | 
| 
       1359 
1348 
     | 
    
         
             
                )
         
     | 
| 
       1360 
1349 
     | 
    
         | 
| 
       1361 
     | 
    
         
            -
                 
     | 
| 
       1362 
     | 
    
         
            -
             
     | 
| 
       1363 
     | 
    
         
            -
             
     | 
| 
      
 1350 
     | 
    
         
            +
                # Pass additional arguments to the lifespan function.
         
     | 
| 
      
 1351 
     | 
    
         
            +
                # They will be used for additional initialization setups.
         
     | 
| 
      
 1352 
     | 
    
         
            +
                if server_args.tokenizer_worker_num == 1:
         
     | 
| 
      
 1353 
     | 
    
         
            +
                    # If it is single tokenizer mode, we can pass the arguments by attributes of the app object.
         
     | 
| 
      
 1354 
     | 
    
         
            +
                    app.is_single_tokenizer_mode = True
         
     | 
| 
      
 1355 
     | 
    
         
            +
                    app.server_args = server_args
         
     | 
| 
      
 1356 
     | 
    
         
            +
                    app.warmup_thread_args = (
         
     | 
| 
       1364 
1357 
     | 
    
         
             
                        server_args,
         
     | 
| 
       1365 
     | 
    
         
            -
                         
     | 
| 
      
 1358 
     | 
    
         
            +
                        pipe_finish_writer,
         
     | 
| 
      
 1359 
     | 
    
         
            +
                        launch_callback,
         
     | 
| 
       1366 
1360 
     | 
    
         
             
                    )
         
     | 
| 
       1367 
     | 
    
         
            -
             
     | 
| 
      
 1361 
     | 
    
         
            +
             
     | 
| 
       1368 
1362 
     | 
    
         
             
                    # Add api key authorization
         
     | 
| 
      
 1363 
     | 
    
         
            +
                    # This is only supported in single tokenizer mode.
         
     | 
| 
       1369 
1364 
     | 
    
         
             
                    if server_args.api_key:
         
     | 
| 
       1370 
1365 
     | 
    
         
             
                        add_api_key_middleware(app, server_args.api_key)
         
     | 
| 
       1371 
     | 
    
         
            -
             
     | 
| 
       1372 
     | 
    
         
            -
                    #  
     | 
| 
       1373 
     | 
    
         
            -
                     
     | 
| 
       1374 
     | 
    
         
            -
             
     | 
| 
       1375 
     | 
    
         
            -
             
     | 
| 
       1376 
     | 
    
         
            -
             
     | 
| 
       1377 
     | 
    
         
            -
                    # Send a warmup request - we will create the thread launch it
         
     | 
| 
       1378 
     | 
    
         
            -
                    # in the lifespan after all other warmups have fired.
         
     | 
| 
       1379 
     | 
    
         
            -
                    warmup_thread = threading.Thread(
         
     | 
| 
       1380 
     | 
    
         
            -
                        target=_wait_and_warmup,
         
     | 
| 
       1381 
     | 
    
         
            -
                        args=(
         
     | 
| 
       1382 
     | 
    
         
            -
                            server_args,
         
     | 
| 
       1383 
     | 
    
         
            -
                            pipe_finish_writer,
         
     | 
| 
       1384 
     | 
    
         
            -
                            launch_callback,
         
     | 
| 
       1385 
     | 
    
         
            -
                        ),
         
     | 
| 
      
 1366 
     | 
    
         
            +
                else:
         
     | 
| 
      
 1367 
     | 
    
         
            +
                    # If it is multi-tokenizer mode, we need to write the arguments to shared memory
         
     | 
| 
      
 1368 
     | 
    
         
            +
                    # for other worker processes to read.
         
     | 
| 
      
 1369 
     | 
    
         
            +
                    app.is_single_tokenizer_mode = False
         
     | 
| 
      
 1370 
     | 
    
         
            +
                    multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
         
     | 
| 
      
 1371 
     | 
    
         
            +
                        port_args, server_args, scheduler_info
         
     | 
| 
       1386 
1372 
     | 
    
         
             
                    )
         
     | 
| 
       1387 
     | 
    
         
            -
                    app.warmup_thread = warmup_thread
         
     | 
| 
       1388 
1373 
     | 
    
         | 
| 
       1389 
1374 
     | 
    
         
             
                try:
         
     | 
| 
       1390 
1375 
     | 
    
         
             
                    # Update logging configs
         
     | 
| 
       1391 
1376 
     | 
    
         
             
                    set_uvicorn_logging_configs()
         
     | 
| 
       1392 
     | 
    
         
            -
             
     | 
| 
      
 1377 
     | 
    
         
            +
             
     | 
| 
       1393 
1378 
     | 
    
         
             
                    # Listen for HTTP requests
         
     | 
| 
       1394 
     | 
    
         
            -
                    if server_args.tokenizer_worker_num  
     | 
| 
      
 1379 
     | 
    
         
            +
                    if server_args.tokenizer_worker_num == 1:
         
     | 
| 
      
 1380 
     | 
    
         
            +
                        uvicorn.run(
         
     | 
| 
      
 1381 
     | 
    
         
            +
                            app,
         
     | 
| 
      
 1382 
     | 
    
         
            +
                            host=server_args.host,
         
     | 
| 
      
 1383 
     | 
    
         
            +
                            port=server_args.port,
         
     | 
| 
      
 1384 
     | 
    
         
            +
                            log_level=server_args.log_level_http or server_args.log_level,
         
     | 
| 
      
 1385 
     | 
    
         
            +
                            timeout_keep_alive=5,
         
     | 
| 
      
 1386 
     | 
    
         
            +
                            loop="uvloop",
         
     | 
| 
      
 1387 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1388 
     | 
    
         
            +
                    else:
         
     | 
| 
       1395 
1389 
     | 
    
         
             
                        from uvicorn.config import LOGGING_CONFIG
         
     | 
| 
       1396 
1390 
     | 
    
         | 
| 
       1397 
1391 
     | 
    
         
             
                        LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
         
     | 
| 
         @@ -1399,7 +1393,6 @@ def launch_server( 
     | 
|
| 
       1399 
1393 
     | 
    
         
             
                            "level": "INFO",
         
     | 
| 
       1400 
1394 
     | 
    
         
             
                            "propagate": False,
         
     | 
| 
       1401 
1395 
     | 
    
         
             
                        }
         
     | 
| 
       1402 
     | 
    
         
            -
             
     | 
| 
       1403 
1396 
     | 
    
         
             
                        monkey_patch_uvicorn_multiprocessing()
         
     | 
| 
       1404 
1397 
     | 
    
         | 
| 
       1405 
1398 
     | 
    
         
             
                        uvicorn.run(
         
     | 
| 
         @@ -1411,22 +1404,10 @@ def launch_server( 
     | 
|
| 
       1411 
1404 
     | 
    
         
             
                            loop="uvloop",
         
     | 
| 
       1412 
1405 
     | 
    
         
             
                            workers=server_args.tokenizer_worker_num,
         
     | 
| 
       1413 
1406 
     | 
    
         
             
                        )
         
     | 
| 
       1414 
     | 
    
         
            -
                    else:
         
     | 
| 
       1415 
     | 
    
         
            -
                        app.is_single_tokenizer_mode = True
         
     | 
| 
       1416 
     | 
    
         
            -
                        uvicorn.run(
         
     | 
| 
       1417 
     | 
    
         
            -
                            app,
         
     | 
| 
       1418 
     | 
    
         
            -
                            host=server_args.host,
         
     | 
| 
       1419 
     | 
    
         
            -
                            port=server_args.port,
         
     | 
| 
       1420 
     | 
    
         
            -
                            log_level=server_args.log_level_http or server_args.log_level,
         
     | 
| 
       1421 
     | 
    
         
            -
                            timeout_keep_alive=5,
         
     | 
| 
       1422 
     | 
    
         
            -
                            loop="uvloop",
         
     | 
| 
       1423 
     | 
    
         
            -
                        )
         
     | 
| 
       1424 
1407 
     | 
    
         
             
                finally:
         
     | 
| 
       1425 
1408 
     | 
    
         
             
                    if server_args.tokenizer_worker_num > 1:
         
     | 
| 
       1426 
1409 
     | 
    
         
             
                        multi_tokenizer_args_shm.unlink()
         
     | 
| 
       1427 
1410 
     | 
    
         
             
                        _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         
     | 
| 
       1428 
     | 
    
         
            -
                    else:
         
     | 
| 
       1429 
     | 
    
         
            -
                        warmup_thread.join()
         
     | 
| 
       1430 
1411 
     | 
    
         | 
| 
       1431 
1412 
     | 
    
         | 
| 
       1432 
1413 
     | 
    
         
             
            def _execute_server_warmup(
         
     | 
    
        sglang/srt/environ.py
    CHANGED
    
    | 
         @@ -111,18 +111,21 @@ class Envs: 
     | 
|
| 
       111 
111 
     | 
    
         
             
                # Model & File Download
         
     | 
| 
       112 
112 
     | 
    
         
             
                SGLANG_USE_MODELSCOPE = EnvBool(False)
         
     | 
| 
       113 
113 
     | 
    
         | 
| 
      
 114 
     | 
    
         
            +
                # Logging Options
         
     | 
| 
      
 115 
     | 
    
         
            +
                SGLANG_LOG_GC = EnvBool(False)
         
     | 
| 
      
 116 
     | 
    
         
            +
                SGLANG_LOG_FORWARD_ITERS = EnvBool(False)
         
     | 
| 
      
 117 
     | 
    
         
            +
                SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
         
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
       114 
119 
     | 
    
         
             
                # Test & Debug
         
     | 
| 
       115 
120 
     | 
    
         
             
                SGLANG_IS_IN_CI = EnvBool(False)
         
     | 
| 
       116 
121 
     | 
    
         
             
                SGLANG_IS_IN_CI_AMD = EnvBool(False)
         
     | 
| 
       117 
122 
     | 
    
         
             
                SGLANG_SET_CPU_AFFINITY = EnvBool(False)
         
     | 
| 
       118 
123 
     | 
    
         
             
                SGLANG_PROFILE_WITH_STACK = EnvBool(True)
         
     | 
| 
       119 
124 
     | 
    
         
             
                SGLANG_RECORD_STEP_TIME = EnvBool(False)
         
     | 
| 
       120 
     | 
    
         
            -
                SGLANG_GC_LOG = EnvBool(False)
         
     | 
| 
       121 
125 
     | 
    
         
             
                SGLANG_FORCE_SHUTDOWN = EnvBool(False)
         
     | 
| 
       122 
126 
     | 
    
         
             
                SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
         
     | 
| 
       123 
127 
     | 
    
         
             
                SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
         
     | 
| 
       124 
128 
     | 
    
         
             
                SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
         
     | 
| 
       125 
     | 
    
         
            -
                SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
         
     | 
| 
       126 
129 
     | 
    
         
             
                SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
         
     | 
| 
       127 
130 
     | 
    
         
             
                SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
         
     | 
| 
       128 
131 
     | 
    
         
             
                SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
         
     | 
| 
         @@ -228,6 +231,7 @@ class Envs: 
     | 
|
| 
       228 
231 
     | 
    
         
             
                SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
         
     | 
| 
       229 
232 
     | 
    
         | 
| 
       230 
233 
     | 
    
         
             
                # Overlap Spec V2
         
     | 
| 
      
 234 
     | 
    
         
            +
                SGLANG_ENABLE_SPEC_V2 = EnvBool(False)
         
     | 
| 
       231 
235 
     | 
    
         
             
                SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)
         
     | 
| 
       232 
236 
     | 
    
         | 
| 
       233 
237 
     | 
    
         
             
                # VLM
         
     | 
| 
         @@ -251,7 +255,17 @@ class Envs: 
     | 
|
| 
       251 
255 
     | 
    
         
             
            envs = Envs()
         
     | 
| 
       252 
256 
     | 
    
         | 
| 
       253 
257 
     | 
    
         | 
| 
      
 258 
     | 
    
         
            +
            def _print_deprecated_env(new_name: str, old_name: str):
                """Warn about a legacy environment variable and mirror it to its new name.

                When ``old_name`` is present in ``os.environ``, a warning is emitted
                and the value is copied to ``new_name``, replacing whatever was stored
                there. When ``old_name`` is absent, nothing happens.

                Args:
                    new_name: Name of the environment variable that should be used.
                    old_name: Legacy name that may still be set.
                """
                legacy_value = os.environ.get(old_name)
                if legacy_value is None:
                    # Legacy variable not set: nothing to warn about or migrate.
                    return

                message = f"Environment variable {old_name} will be deprecated, please use {new_name} instead"
                warnings.warn(message)
                os.environ[new_name] = legacy_value
         
     | 
| 
      
 264 
     | 
    
         
            +
             
     | 
| 
      
 265 
     | 
    
         
            +
             
     | 
| 
       254 
266 
     | 
    
         
             
            def _convert_SGL_to_SGLANG():
         
     | 
| 
      
 267 
     | 
    
         
            +
                _print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
         
     | 
| 
      
 268 
     | 
    
         
            +
             
     | 
| 
       255 
269 
     | 
    
         
             
                for key, value in os.environ.items():
         
     | 
| 
       256 
270 
     | 
    
         
             
                    if key.startswith("SGL_"):
         
     | 
| 
       257 
271 
     | 
    
         
             
                        new_key = key.replace("SGL_", "SGLANG_", 1)
         
     | 
| 
         @@ -415,10 +415,19 @@ class _DetailSinglePassGatherer(_SinglePassGatherer): 
     | 
|
| 
       415 
415 
     | 
    
         | 
| 
       416 
416 
     | 
    
         
             
                def collect(self) -> Dict:
                    """Assemble the recorded data of this pass into a dictionary.

                    Returns:
                        A dict holding every entry of ``self._metadata`` plus:
                        ``topk_ids_of_layer`` (a CPU clone of the recorded ids,
                        truncated along dim 1 to the number of input ids),
                        ``misc_objects``, and ``global_physical_count`` (computed
                        by ``_convert_per_token_to_global_physical_count``).
                    """
                    # Only the first `token_count` positions along dim 1 are used.
                    token_count = len(self._metadata["input_ids"])
                    location_meta = self._expert_location_metadata

                    physical_count = _convert_per_token_to_global_physical_count(
                        token_count,
                        num_layers=location_meta.num_layers,
                        num_physical_experts=location_meta.num_physical_experts,
                        _topk_ids_of_layer=self._topk_ids_of_layer,
                    )
                    topk_ids_cpu = self._topk_ids_of_layer[:, :token_count, :].clone().cpu()

                    return dict(
                        **self._metadata,
                        topk_ids_of_layer=topk_ids_cpu,
                        misc_objects=self._misc_objects,
                        global_physical_count=physical_count,
                    )
         
     | 
| 
       423 
432 
     | 
    
         | 
| 
       424 
433 
     | 
    
         | 
| 
         @@ -547,6 +556,27 @@ class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer): 
     | 
|
| 
       547 
556 
     | 
    
         
             
                    self._data[layer_idx, :] += local_physical_count_of_layer
         
     | 
| 
       548 
557 
     | 
    
         | 
| 
       549 
558 
     | 
    
         | 
| 
      
 559 
     | 
    
         
            +
            def _convert_per_token_to_global_physical_count(
                num_tokens: int,
                num_layers: int,
                num_physical_experts: int,
                _topk_ids_of_layer: torch.Tensor,
            ) -> torch.Tensor:
                """Count, per layer, how many times each expert id was selected.

                Args:
                    num_tokens: Number of leading positions along dim 1 to include.
                    num_layers: Size of the leading dimension of the id tensor.
                    num_physical_experts: Number of columns in the result; every
                        id other than ``-1`` must be in ``[0, num_physical_experts)``.
                    _topk_ids_of_layer: 3-D integer tensor indexed as
                        ``[layer, token, k]``. Entries equal to ``-1`` are ignored
                        (presumably padding / unrouted slots -- confirm with caller).

                Returns:
                    Tensor of shape ``(num_layers, num_physical_experts)`` with the
                    same dtype and device as ``_topk_ids_of_layer``, where entry
                    ``[l, e]`` is the number of occurrences of id ``e`` in layer ``l``.
                """
                topk_ids_layer_major = _topk_ids_of_layer[:, :num_tokens, :].reshape(num_layers, -1)
                mask = topk_ids_layer_major != -1

                # Ignored slots are redirected to column 0 but contribute 0 to the sum.
                index = topk_ids_layer_major.masked_fill(~mask, 0).long()

                ans = torch.zeros(
                    (num_layers, num_physical_experts),
                    dtype=_topk_ids_of_layer.dtype,
                    device=_topk_ids_of_layer.device,
                )
                # scatter_add_ requires `src` to have the same dtype as `ans`; a fixed
                # int32 cast would fail whenever the ids are stored in another dtype.
                src = mask.to(ans.dtype)
                ans.scatter_add_(dim=1, index=index, src=src)
                return ans
         
     | 
| 
      
 578 
     | 
    
         
            +
             
     | 
| 
      
 579 
     | 
    
         
            +
             
     | 
| 
       550 
580 
     | 
    
         
             
            def _convert_local_to_global_physical_count(
         
     | 
| 
       551 
581 
     | 
    
         
             
                local_physical_count: torch.Tensor,
         
     | 
| 
       552 
582 
     | 
    
         
             
                rank: int,
         
     | 
| 
         @@ -16,6 +16,7 @@ from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector 
     | 
|
| 
       16 
16 
     | 
    
         
             
            from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
         
     | 
| 
       17 
17 
     | 
    
         
             
            from sglang.srt.function_call.kimik2_detector import KimiK2Detector
         
     | 
| 
       18 
18 
     | 
    
         
             
            from sglang.srt.function_call.llama32_detector import Llama32Detector
         
     | 
| 
      
 19 
     | 
    
         
            +
            from sglang.srt.function_call.minimax_m2 import MinimaxM2Detector
         
     | 
| 
       19 
20 
     | 
    
         
             
            from sglang.srt.function_call.mistral_detector import MistralDetector
         
     | 
| 
       20 
21 
     | 
    
         
             
            from sglang.srt.function_call.pythonic_detector import PythonicDetector
         
     | 
| 
       21 
22 
     | 
    
         
             
            from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
         
     | 
| 
         @@ -49,6 +50,7 @@ class FunctionCallParser: 
     | 
|
| 
       49 
50 
     | 
    
         
             
                    "qwen25": Qwen25Detector,
         
     | 
| 
       50 
51 
     | 
    
         
             
                    "qwen3_coder": Qwen3CoderDetector,
         
     | 
| 
       51 
52 
     | 
    
         
             
                    "step3": Step3Detector,
         
     | 
| 
      
 53 
     | 
    
         
            +
                    "minimax-m2": MinimaxM2Detector,
         
     | 
| 
       52 
54 
     | 
    
         
             
                }
         
     | 
| 
       53 
55 
     | 
    
         | 
| 
       54 
56 
     | 
    
         
             
                def __init__(self, tools: List[Tool], tool_call_parser: str):
         
     |