PyPI - sglang - Versions diffs - 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl - Mend

sglang 0.5.3rc0-py3-none-any.whl → 0.5.3rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (282)

sglang/bench_one_batch.py +7 -9
sglang/bench_one_batch_server.py +321 -31
sglang/bench_serving.py +10 -3
sglang/global_config.py +2 -2
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/launch_server.py +14 -0
sglang/profiler.py +2 -2
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/falcon_h1.py +360 -0
sglang/srt/configs/load_config.py +8 -0
sglang/srt/configs/model_config.py +160 -105
sglang/srt/configs/qwen3_vl.py +586 -0
sglang/srt/constrained/base_grammar_backend.py +1 -0
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/xgrammar_backend.py +6 -4
sglang/srt/debug_utils/dumper.py +10 -3
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
sglang/srt/disaggregation/common/conn.py +266 -98
sglang/srt/disaggregation/decode.py +50 -9
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
sglang/srt/disaggregation/mooncake/conn.py +51 -541
sglang/srt/disaggregation/nixl/conn.py +148 -39
sglang/srt/disaggregation/prefill.py +31 -14
sglang/srt/disaggregation/utils.py +36 -5
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/parallel_state.py +135 -80
sglang/srt/entrypoints/engine.py +23 -3
sglang/srt/entrypoints/grpc_request_manager.py +330 -55
sglang/srt/entrypoints/grpc_server.py +232 -102
sglang/srt/entrypoints/http_server.py +49 -9
sglang/srt/entrypoints/openai/protocol.py +110 -5
sglang/srt/entrypoints/openai/serving_base.py +25 -6
sglang/srt/entrypoints/openai/serving_chat.py +178 -49
sglang/srt/entrypoints/openai/serving_completions.py +5 -3
sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
sglang/srt/entrypoints/openai/serving_responses.py +42 -0
sglang/srt/environ.py +285 -0
sglang/srt/eplb/expert_location.py +30 -5
sglang/srt/function_call/function_call_parser.py +3 -2
sglang/srt/function_call/glm4_moe_detector.py +3 -3
sglang/srt/function_call/gpt_oss_detector.py +23 -0
sglang/srt/function_call/json_array_parser.py +63 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/utils.py +96 -5
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
sglang/srt/layers/activation.py +7 -6
sglang/srt/layers/attention/aiter_backend.py +14 -15
sglang/srt/layers/attention/ascend_backend.py +108 -9
sglang/srt/layers/attention/attention_registry.py +206 -0
sglang/srt/layers/attention/base_attn_backend.py +12 -3
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
sglang/srt/layers/attention/flashattention_backend.py +41 -8
sglang/srt/layers/attention/flashinfer_backend.py +112 -194
sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
sglang/srt/layers/attention/flashmla_backend.py +7 -5
sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
sglang/srt/layers/attention/mamba/mamba.py +566 -1
sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/utils.py +24 -0
sglang/srt/layers/attention/nsa_backend.py +887 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/triton_backend.py +42 -9
sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
sglang/srt/layers/attention/vision.py +58 -0
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/communicator.py +8 -0
sglang/srt/layers/dp_attention.py +11 -1
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +2 -0
sglang/srt/layers/linear.py +21 -4
sglang/srt/layers/logits_processor.py +15 -2
sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
sglang/srt/layers/moe/ep_moe/layer.py +147 -74
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
sglang/srt/layers/moe/utils.py +10 -0
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
sglang/srt/layers/quantization/fp8.py +2 -2
sglang/srt/layers/quantization/fp8_utils.py +1 -1
sglang/srt/layers/quantization/modelopt_quant.py +44 -9
sglang/srt/layers/quantization/mxfp4.py +12 -4
sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
sglang/srt/layers/quantization/w4afp8.py +0 -4
sglang/srt/layers/quantization/w8a8_int8.py +15 -3
sglang/srt/layers/rotary_embedding.py +78 -31
sglang/srt/layers/sampler.py +52 -4
sglang/srt/layers/utils.py +23 -0
sglang/srt/lora/backend/base_backend.py +3 -3
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +10 -4
sglang/srt/lora/lora.py +7 -5
sglang/srt/lora/lora_manager.py +17 -6
sglang/srt/lora/mem_pool.py +1 -1
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
sglang/srt/lora/utils.py +7 -5
sglang/srt/managers/cache_controller.py +42 -142
sglang/srt/managers/data_parallel_controller.py +11 -46
sglang/srt/managers/detokenizer_manager.py +11 -11
sglang/srt/managers/io_struct.py +162 -118
sglang/srt/managers/mm_utils.py +43 -6
sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +53 -0
sglang/srt/managers/schedule_batch.py +167 -86
sglang/srt/managers/schedule_policy.py +143 -16
sglang/srt/managers/scheduler.py +359 -214
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
sglang/srt/managers/tokenizer_manager.py +84 -136
sglang/srt/managers/tp_worker.py +39 -29
sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
sglang/srt/managers/utils.py +1 -45
sglang/srt/mem_cache/allocator.py +14 -20
sglang/srt/mem_cache/allocator_ascend.py +41 -27
sglang/srt/mem_cache/base_prefix_cache.py +1 -1
sglang/srt/mem_cache/chunk_cache.py +8 -1
sglang/srt/mem_cache/evict_policy.py +23 -0
sglang/srt/mem_cache/hicache_storage.py +40 -1
sglang/srt/mem_cache/hiradix_cache.py +119 -32
sglang/srt/mem_cache/memory_pool.py +188 -10
sglang/srt/mem_cache/memory_pool_host.py +134 -182
sglang/srt/mem_cache/radix_cache.py +222 -71
sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
sglang/srt/mem_cache/swa_radix_cache.py +25 -34
sglang/srt/metrics/collector.py +82 -120
sglang/srt/metrics/func_timer.py +2 -7
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +2 -2
sglang/srt/model_executor/cuda_graph_runner.py +39 -32
sglang/srt/model_executor/forward_batch_info.py +23 -38
sglang/srt/model_executor/model_runner.py +131 -183
sglang/srt/model_executor/npu_graph_runner.py +12 -5
sglang/srt/model_loader/loader.py +14 -10
sglang/srt/model_loader/weight_utils.py +156 -2
sglang/srt/models/bailing_moe.py +27 -4
sglang/srt/models/deepseek_nextn.py +6 -1
sglang/srt/models/deepseek_v2.py +536 -153
sglang/srt/models/dots_ocr.py +173 -0
sglang/srt/models/falcon_h1.py +576 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +1 -1
sglang/srt/models/gemma3n_mm.py +1 -1
sglang/srt/models/glm4_moe.py +3 -3
sglang/srt/models/glm4_moe_nextn.py +2 -2
sglang/srt/models/glm4v.py +1 -1
sglang/srt/models/glm4v_moe.py +1 -1
sglang/srt/models/gpt_oss.py +7 -30
sglang/srt/models/kimi_vl_moonvit.py +2 -2
sglang/srt/models/llama.py +4 -0
sglang/srt/models/longcat_flash.py +1 -1
sglang/srt/models/longcat_flash_nextn.py +1 -1
sglang/srt/models/mllama4.py +15 -4
sglang/srt/models/qwen2.py +0 -7
sglang/srt/models/qwen2_5_vl.py +2 -2
sglang/srt/models/qwen2_audio.py +1 -1
sglang/srt/models/qwen2_moe.py +64 -1
sglang/srt/models/qwen2_vl.py +1 -1
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +31 -3
sglang/srt/models/qwen3_next.py +36 -9
sglang/srt/models/qwen3_vl.py +787 -0
sglang/srt/models/qwen3_vl_moe.py +471 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/sarashina2_vision.py +269 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +51 -0
sglang/srt/multimodal/processors/base_processor.py +15 -7
sglang/srt/multimodal/processors/dots_vlm.py +2 -3
sglang/srt/multimodal/processors/internvl.py +20 -8
sglang/srt/multimodal/processors/qwen_vl.py +8 -1
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/sampling/sampling_batch_info.py +20 -2
sglang/srt/sampling/sampling_params.py +7 -0
sglang/srt/server_args.py +753 -295
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +151 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
sglang/srt/speculative/eagle_worker.py +57 -25
sglang/srt/speculative/ngram_utils.py +428 -0
sglang/srt/speculative/ngram_worker.py +245 -0
sglang/srt/speculative/spec_info.py +47 -0
sglang/srt/speculative/spec_utils.py +606 -0
sglang/srt/torch_memory_saver_adapter.py +5 -7
sglang/srt/tracing/trace.py +32 -6
sglang/srt/two_batch_overlap.py +8 -5
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{utils.py → utils/common.py} +399 -74
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/get_logits_ut.py +57 -0
sglang/test/run_eval.py +79 -11
sglang/test/runners.py +1 -1
sglang/test/simple_eval_common.py +5 -2
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +2 -2
sglang/test/test_deterministic.py +297 -0
sglang/test/test_disaggregation_utils.py +12 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +355 -4
sglang/utils.py +10 -1
sglang/version.py +1 -1
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/openai/serving_completions.py CHANGED Viewed

@@ -90,8 +90,8 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}
-        # Extract customer labels from raw request headers
-        customer_labels = self.extract_customer_labels(raw_request)
+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
@@ -107,7 +107,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
-            customer_labels=customer_labels,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )
         return adapted_request, request

sglang/srt/entrypoints/openai/serving_embedding.py CHANGED Viewed

@@ -125,6 +125,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
         adapted_request = EmbeddingReqInput(
             **prompt_kwargs,
             rid=request.rid,
+            priority=request.priority,
         )
         return adapted_request, request

sglang/srt/entrypoints/openai/serving_responses.py CHANGED Viewed

@@ -123,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):
         self.background_tasks: dict[str, asyncio.Task] = {}
+    # error helpers dedicated for v1/responses
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "invalid_request_error",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        nested_error = {
+            "message": message,
+            "type": err_type,
+            "param": param,
+            "code": status_code,
+        }
+        return ORJSONResponse(content={"error": nested_error}, status_code=status_code)
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        return json.dumps(
+            {
+                "error": {
+                    "message": message,
+                    "type": err_type,
+                    "param": None,
+                    "code": status_code,
+                }
+            }
+        )
     def _request_id_prefix(self) -> str:
         return "resp_"
@@ -245,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                         sampling_params=sampling_params,
                         stream=request.stream,
                         rid=request.request_id,
+                        extra_key=self._compute_extra_key(request),
                         background=request.background,
                     )
@@ -833,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):
         async for ctx in result_generator:
+            # Only process context objects that implement the `is_expecting_start()` method,
+            # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
+            # Contexts without this method are skipped, as they do not represent a new turn
+            # or are not compatible with per-turn handling in the /v1/responses endpoint.
+            if not hasattr(ctx, "is_expecting_start"):
+                continue
             if ctx.is_expecting_start():
                 current_output_index += 1
                 sent_output_item_added = False
@@ -1250,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                 sampling_params=sampling_params,
                 stream=adapted_request.stream,
                 rid=request_id,
+                extra_key=adapted_request.extra_key,
                 return_logprob=adapted_request.return_logprob,
                 logprob_start_len=adapted_request.logprob_start_len,
                 top_logprobs_num=adapted_request.top_logprobs_num,

sglang/srt/environ.py ADDED Viewed

@@ -0,0 +1,285 @@
+import os
+import subprocess
+import warnings
+from contextlib import ExitStack, contextmanager
+from typing import Any
+class EnvField:
+    def __init__(self, default: Any):
+        self.default = default
+        # NOTE: we use None to indicate whether the value is set or not
+        # If the value is manually set to None, we need mark it as _set_to_none.
+        # Always use clear() to reset the value, which leads to the default fallback.
+        self._set_to_none = False
+    def __set_name__(self, owner, name):
+        self.name = name
+    def parse(self, value: str) -> Any:
+        raise NotImplementedError()
+    def get(self) -> Any:
+        value = os.getenv(self.name)
+        if self._set_to_none:
+            assert value is None
+            return None
+        if value is None:
+            return self.default
+        try:
+            return self.parse(value)
+        except ValueError as e:
+            warnings.warn(
+                f'Invalid value for {self.name}: {e}, using default "{self.default}"'
+            )
+            return self.default
+    def is_set(self):
+        # NOTE: If None is manually set, it is considered as set.
+        return self.name in os.environ or self._set_to_none
+    def get_set_value_or(self, or_value: Any):
+        # NOTE: Ugly usage, but only way to get custom default value.
+        return self.get() if self.is_set() else or_value
+    def set(self, value: Any):
+        if value is None:
+            self._set_to_none = True
+            os.environ.pop(self.name, None)
+        else:
+            self._set_to_none = False
+            os.environ[self.name] = str(value)
+    @contextmanager
+    def override(self, value: Any):
+        backup_present = self.name in os.environ
+        backup_value = os.environ.get(self.name)
+        backup_set_to_none = self._set_to_none
+        self.set(value)
+        yield
+        if backup_present:
+            os.environ[self.name] = backup_value
+        else:
+            os.environ.pop(self.name, None)
+        self._set_to_none = backup_set_to_none
+    def clear(self):
+        os.environ.pop(self.name, None)
+        self._set_to_none = False
+    @property
+    def value(self):
+        return self.get()
+class EnvStr(EnvField):
+    def parse(self, value: str) -> str:
+        return value
+class EnvBool(EnvField):
+    def parse(self, value: str) -> bool:
+        value = value.lower()
+        if value in ["true", "1", "yes", "y"]:
+            return True
+        if value in ["false", "0", "no", "n"]:
+            return False
+        raise ValueError(f'"{value}" is not a valid boolean value')
+class EnvInt(EnvField):
+    def parse(self, value: str) -> int:
+        try:
+            return int(value)
+        except ValueError:
+            raise ValueError(f'"{value}" is not a valid integer value')
+class EnvFloat(EnvField):
+    def parse(self, value: str) -> float:
+        try:
+            return float(value)
+        except ValueError:
+            raise ValueError(f'"{value}" is not a valid float value')
+class Envs:
+    # fmt: off
+    # Model & File Download
+    SGLANG_USE_MODELSCOPE = EnvBool(False)
+    # Test & Debug
+    SGLANG_IS_IN_CI = EnvBool(False)
+    SGLANG_AMD_CI = EnvBool(False)
+    SGLANG_TEST_RETRACT = EnvBool(False)
+    SGLANG_SET_CPU_AFFINITY = EnvBool(False)
+    SGLANG_PROFILE_WITH_STACK = EnvBool(True)
+    SGLANG_RECORD_STEP_TIME = EnvBool(False)
+    SGLANG_GC_LOG = EnvBool(False)
+    SGLANG_FORCE_SHUTDOWN = EnvBool(False)
+    SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
+    SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
+    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
+    SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
+    SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
+    SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
+    # Model Parallel
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
+    # Constrained Decoding
+    SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
+    SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)
+    # Hi-Cache
+    SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)
+    # Mooncake KV Transfer
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
+    ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)
+    # AMD & ROCm
+    SGLANG_USE_AITER = EnvBool(False)
+    SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)
+    # Quantization
+    SGLANG_INT4_WEIGHT = EnvBool(False)
+    SGLANG_CPU_QUANTIZATION = EnvBool(False)
+    SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
+    SGLANG_FORCE_FP8_MARLIN = EnvBool(False)
+    # Flashinfer
+    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
+    SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)
+    # Triton
+    SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
+    # Torch Compile
+    SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)
+    # EPLB
+    SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
+    SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
+    SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
+    SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)
+    # TBO
+    SGLANG_TBO_DEBUG = EnvBool(False)
+    # DeepGemm
+    SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
+    SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
+    SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
+    SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))
+    SGLANG_DG_USE_NVRTC = EnvBool(False)
+    SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)
+    # sgl-kernel
+    SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)
+    # vLLM dependencies
+    USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
+    USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)
+    USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
+    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
+    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
+    SGLANG_MOE_PADDING = EnvBool(False)
+    SGLANG_CUTLASS_MOE = EnvBool(False)
+    HF_HUB_DISABLE_XET = EnvBool(False)
+    DISABLE_OPENAPI_DOC = EnvBool(False)
+    SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
+    SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
+    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
+    SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
+    SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)
+    # Deterministic inference
+    SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
+    SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
+    SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
+    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
+    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
+    # fmt: on
+envs = Envs()
+def _convert_SGL_to_SGLANG():
+    for key, value in os.environ.items():
+        if key.startswith("SGL_"):
+            new_key = key.replace("SGL_", "SGLANG_", 1)
+            warnings.warn(
+                f"Environment variable {key} is deprecated, please use {new_key}"
+            )
+            os.environ[new_key] = value
+_convert_SGL_to_SGLANG()
+def example_with_exit_stack():
+    # Use this style of context manager in unit test
+    exit_stack = ExitStack()
+    exit_stack.enter_context(envs.SGLANG_TEST_RETRACT.override(False))
+    assert envs.SGLANG_TEST_RETRACT.value is False
+    exit_stack.close()
+    assert envs.SGLANG_TEST_RETRACT.value is None
+def example_with_subprocess():
+    command = ["python", "-c", "import os; print(os.getenv('SGLANG_TEST_RETRACT'))"]
+    with envs.SGLANG_TEST_RETRACT.override(True):
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        process.wait()
+        output = process.stdout.read().decode("utf-8").strip()
+        assert output == "True"
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output = process.stdout.read().decode("utf-8").strip()
+    assert output == "None"
+def examples():
+    # Example usage for envs
+    envs.SGLANG_TEST_RETRACT.clear()
+    assert envs.SGLANG_TEST_RETRACT.value is False
+    envs.SGLANG_TEST_RETRACT.set(None)
+    assert envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+    envs.SGLANG_TEST_RETRACT.clear()
+    assert not envs.SGLANG_TEST_RETRACT.is_set()
+    envs.SGLANG_TEST_RETRACT.set(True)
+    assert envs.SGLANG_TEST_RETRACT.value is True
+    with envs.SGLANG_TEST_RETRACT.override(None):
+        assert (
+            envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+        )
+    assert envs.SGLANG_TEST_RETRACT.value is True
+    envs.SGLANG_TEST_RETRACT.set(None)
+    with envs.SGLANG_TEST_RETRACT.override(True):
+        assert envs.SGLANG_TEST_RETRACT.value is True
+    assert envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+    example_with_exit_stack()
+    example_with_subprocess()
+if __name__ == "__main__":
+    examples()

sglang/srt/eplb/expert_location.py CHANGED Viewed

@@ -231,6 +231,7 @@ class ExpertLocationMetadata:
             logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
             logical_to_rank_dispatch_physical_map=(
                 compute_logical_to_rank_dispatch_physical_map(
+                    server_args=server_args,
                     logical_to_all_physical_map=logical_to_all_physical_map,
                     num_gpus=ep_size,
                     num_physical_experts=num_physical_experts,
@@ -340,6 +341,7 @@ def _pad_nested_array(arr, pad_value):
 # TODO optimize performance (rewrite and/or run in separate process with overlap)
 def compute_logical_to_rank_dispatch_physical_map(
+    server_args: ServerArgs,
     logical_to_all_physical_map: torch.Tensor,
     num_gpus: int,
     num_physical_experts: int,
@@ -348,7 +350,9 @@ def compute_logical_to_rank_dispatch_physical_map(
 ):
     r = random.Random(seed)
-    num_local_physical_experts = num_physical_experts // num_gpus
+    num_local_gpu_physical_experts = num_physical_experts // num_gpus
+    num_gpus_per_node = server_args.ep_size // server_args.nnodes
+    num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
     num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
     dtype = logical_to_all_physical_map.dtype
@@ -372,13 +376,28 @@ def compute_logical_to_rank_dispatch_physical_map(
                     physical_expert_id
                     for physical_expert_id in candidate_physical_expert_ids
                     if _compute_gpu_id_of_physical_expert(
-                        physical_expert_id, num_local_physical_experts
+                        physical_expert_id, num_local_gpu_physical_experts
                     )
                     == gpu_id
                 ]
                 if len(same_gpu_physical_expert_ids) > 0:
+                    # 1. Prefer same-GPU experts
                     output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
+                else:
+                    # 2. Otherwise, prefer same-node experts
+                    node_id = gpu_id // num_gpus_per_node
+                    same_node_physical_expert_ids = [
+                        physical_expert_id
+                        for physical_expert_id in candidate_physical_expert_ids
+                        if _compute_node_id_of_physical_expert(
+                            physical_expert_id, num_local_node_physical_experts
+                        )
+                        == node_id
+                    ]
+                    if len(same_node_physical_expert_ids) > 0:
+                        output_partial[gpu_id] = same_node_physical_expert_ids[0]
+            # 3. Fill remaining slots with fair random choices
             num_remain = torch.sum(output_partial == -1).item()
             output_partial[output_partial == -1] = torch.tensor(
                 _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
@@ -404,9 +423,15 @@ def _logical_to_all_physical_raw(
 def _compute_gpu_id_of_physical_expert(
-    physical_expert_id: int, num_local_physical_experts: int
+    physical_expert_id: int, num_local_gpu_physical_experts: int
+) -> int:
+    return physical_expert_id // num_local_gpu_physical_experts
+def _compute_node_id_of_physical_expert(
+    physical_expert_id: int, num_local_host_physical_experts: int
 ) -> int:
-    return physical_expert_id // num_local_physical_experts
+    return physical_expert_id // num_local_host_physical_experts
 def _fair_choices(arr: List, k: int, r: random.Random) -> List:

sglang/srt/function_call/function_call_parser.py CHANGED Viewed

@@ -20,6 +20,7 @@ from sglang.srt.function_call.pythonic_detector import PythonicDetector
 from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
 from sglang.srt.function_call.qwen25_detector import Qwen25Detector
 from sglang.srt.function_call.step3_detector import Step3Detector
+from sglang.srt.function_call.utils import get_json_schema_constraint
 logger = logging.getLogger(__name__)
@@ -178,8 +179,8 @@ class FunctionCallParser:
             strict_tag = self.get_structure_tag()
             return ("structural_tag", strict_tag)
         elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
-            ebnf = self.get_ebnf(tool_choice)
-            return ("ebnf", ebnf) if ebnf is not None else None
+            json_schema = get_json_schema_constraint(self.tools, tool_choice)
+            return ("json_schema", json_schema)
     def get_ebnf(
         self, tool_choice: Union[ToolChoice, Literal["required"]]

sglang/srt/function_call/glm4_moe_detector.py CHANGED Viewed

@@ -39,7 +39,7 @@ def parse_arguments(json_value):
 class Glm4MoeDetector(BaseFormatDetector):
     """
-    Detector for GLM-4.5 models.
+    Detector for GLM-4.5 and GLM-4.6 models.
     Assumes function call format:
       <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
     """
@@ -53,7 +53,7 @@ class Glm4MoeDetector(BaseFormatDetector):
         self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
     def has_tool_call(self, text: str) -> bool:
-        """Check if the text contains a glm-4.5 format tool call."""
+        """Check if the text contains a glm-4.5 / glm-4.6 format tool call."""
         return self.bot_token in text
     def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
@@ -102,7 +102,7 @@ class Glm4MoeDetector(BaseFormatDetector):
         self, new_text: str, tools: List[Tool]
     ) -> StreamingParseResult:
         """
-        Streaming incremental parsing tool calls for GLM-4.5 format.
+        Streaming incremental parsing tool calls for GLM-4.5 and GLM-4.6 format.
         """
         self._buffer += new_text
         current_text = self._buffer

sglang/srt/function_call/gpt_oss_detector.py CHANGED Viewed

@@ -81,6 +81,29 @@ class GptOssDetector(BaseFormatDetector):
         # Always use HarmonyParser for parsing to ensure proper filtering
         events = self.harmony_parser.parse(new_text)
+        # If there are no parsed events and the chunk contains no Harmony structural
+        # markers, treat it as plain text and pass it through. This fixes a bug where
+        # normal content was held in the buffer when tools were provided but not used.
+        if not events:
+            has_harmony_markers = any(
+                marker in self._buffer
+                for marker in (
+                    "<|start|>",
+                    "<|channel|>",
+                    "<|message|>",
+                    "<|constrain|>",
+                    "<|end|>",
+                    "<|call|>",
+                    "<|return|>",
+                    "assistantfinal",
+                )
+            )
+            if not has_harmony_markers:
+                # Plain text with no tool markers — emit as normal content
+                out = self._buffer
+                self._buffer = ""
+                return StreamingParseResult(normal_text=out, calls=[])
         # Quick check if we might have tool calls
         if (
             "<|channel|>commentary to=" not in self._buffer

sglang/srt/function_call/json_array_parser.py ADDED Viewed

@@ -0,0 +1,63 @@
+import json
+import re
+from typing import List
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import StreamingParseResult
+class JsonArrayParser(BaseFormatDetector):
+    """
+    Parser for JSON array tool calls when JSON schema constraints are active.
+    This parser is used when tool_choice="required" or a specific tool is named,
+    bypassing model-specific parsers in favor of direct JSON array parsing.
+    """
+    def __init__(self):
+        super().__init__()
+        # Configure for JSON array parsing
+        self.bot_token = "["
+        self.eot_token = "]"
+        self.tool_call_separator = ","
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains a JSON tool call (array or single object).
+        """
+        return "[" in text or "{" in text
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        Parse JSON tool calls using the base class implementation.
+        """
+        raise NotImplementedError(
+            "Detect and parse not supported for JSON schema constraints."
+        )
+    def build_ebnf(self, tools: List[Tool]) -> str:
+        """
+        Build an EBNF grammar for constrained generation.
+        This is not used for JSON schema constraints as they are handled
+        by the constraint backends directly.
+        """
+        raise NotImplementedError(
+            "EBNF generation is not supported for JSON schema constraints."
+        )
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing with tool validation.
+        """
+        return super().parse_streaming_increment(new_text, tools)
+    def structure_info(self) -> callable:
+        """
+        Return a function that creates StructureInfo for constrained generation.
+        This is not used for JSON schema constraints as they are handled
+        by the constraint backends directly.
+        """
+        raise NotImplementedError("structure_info not used for JSON schema constraints")

sglang/srt/function_call/kimik2_detector.py CHANGED Viewed

@@ -50,6 +50,11 @@ class KimiK2Detector(BaseFormatDetector):
         self._last_arguments = ""
+        # Robust parser for ids like "functions.search:0" or fallback "search:0"
+        self.tool_call_id_regex = re.compile(
+            r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$"
+        )
     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a KimiK2 format tool call."""
         return self.bot_token in text
@@ -76,14 +81,18 @@ class KimiK2Detector(BaseFormatDetector):
             tool_calls = []
             for match in function_call_tuples:
                 function_id, function_args = match
-                function_name = function_id.split(".")[1].split(":")[0]
-                function_idx = int(function_id.split(".")[1].split(":")[1])
+                m = self.tool_call_id_regex.match(function_id)
+                if not m:
+                    logger.warning("Unexpected tool_call_id format: %s", function_id)
+                    continue
+                function_name = m.group("name")
+                function_idx = int(m.group("index"))
                 logger.info(f"function_name {function_name}")
                 tool_calls.append(
                     ToolCallItem(
-                        tool_index=function_idx,  # Use the call index in the response, not tool position
+                        tool_index=function_idx,
                         name=function_name,
                         parameters=function_args,
                     )
@@ -128,7 +137,11 @@ class KimiK2Detector(BaseFormatDetector):
                 function_id = match.group("tool_call_id")
                 function_args = match.group("function_arguments")
-                function_name = function_id.split(".")[1].split(":")[0]
+                m = self.tool_call_id_regex.match(function_id)
+                if not m:
+                    logger.warning("Unexpected tool_call_id format: %s", function_id)
+                    return StreamingParseResult(normal_text="", calls=calls)
+                function_name = m.group("name")
                 # Initialize state if this is the first tool call
                 if self.current_tool_id == -1:

sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

sglang 0.5.3rc0py3-none-any.whl → 0.5.3rc2py3-none-any.whl