sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
 - sglang/bench_one_batch_server.py +41 -25
 - sglang/bench_serving.py +378 -160
 - sglang/check_env.py +1 -1
 - sglang/compile_deep_gemm.py +6 -2
 - sglang/global_config.py +1 -25
 - sglang/lang/api.py +6 -0
 - sglang/lang/interpreter.py +1 -0
 - sglang/lang/ir.py +13 -0
 - sglang/launch_server.py +10 -15
 - sglang/profiler.py +18 -1
 - sglang/srt/_custom_ops.py +1 -1
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
 - sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
 - sglang/srt/compilation/backend.py +437 -0
 - sglang/srt/compilation/compilation_config.py +20 -0
 - sglang/srt/compilation/compilation_counter.py +47 -0
 - sglang/srt/compilation/compile.py +210 -0
 - sglang/srt/compilation/compiler_interface.py +503 -0
 - sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
 - sglang/srt/compilation/fix_functionalization.py +134 -0
 - sglang/srt/compilation/fx_utils.py +83 -0
 - sglang/srt/compilation/inductor_pass.py +140 -0
 - sglang/srt/compilation/pass_manager.py +66 -0
 - sglang/srt/compilation/piecewise_context_manager.py +40 -0
 - sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
 - sglang/srt/configs/__init__.py +4 -0
 - sglang/srt/configs/deepseek_ocr.py +262 -0
 - sglang/srt/configs/deepseekvl2.py +194 -96
 - sglang/srt/configs/dots_vlm.py +2 -7
 - sglang/srt/configs/falcon_h1.py +13 -64
 - sglang/srt/configs/load_config.py +25 -2
 - sglang/srt/configs/mamba_utils.py +117 -0
 - sglang/srt/configs/model_config.py +136 -25
 - sglang/srt/configs/modelopt_config.py +30 -0
 - sglang/srt/configs/nemotron_h.py +286 -0
 - sglang/srt/configs/olmo3.py +105 -0
 - sglang/srt/configs/points_v15_chat.py +29 -0
 - sglang/srt/configs/qwen3_next.py +11 -47
 - sglang/srt/configs/qwen3_omni.py +613 -0
 - sglang/srt/configs/qwen3_vl.py +0 -10
 - sglang/srt/connector/remote_instance.py +1 -1
 - sglang/srt/constrained/base_grammar_backend.py +5 -1
 - sglang/srt/constrained/llguidance_backend.py +5 -0
 - sglang/srt/constrained/outlines_backend.py +1 -1
 - sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
 - sglang/srt/constrained/utils.py +12 -0
 - sglang/srt/constrained/xgrammar_backend.py +20 -11
 - sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
 - sglang/srt/disaggregation/base/conn.py +17 -4
 - sglang/srt/disaggregation/common/conn.py +4 -2
 - sglang/srt/disaggregation/decode.py +123 -31
 - sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
 - sglang/srt/disaggregation/fake/conn.py +11 -3
 - sglang/srt/disaggregation/mooncake/conn.py +157 -19
 - sglang/srt/disaggregation/nixl/conn.py +69 -24
 - sglang/srt/disaggregation/prefill.py +96 -270
 - sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
 - sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
 - sglang/srt/distributed/device_communicators/pynccl.py +24 -12
 - sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
 - sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
 - sglang/srt/distributed/naive_distributed.py +5 -4
 - sglang/srt/distributed/parallel_state.py +63 -19
 - sglang/srt/elastic_ep/elastic_ep.py +74 -0
 - sglang/srt/entrypoints/context.py +3 -2
 - sglang/srt/entrypoints/engine.py +83 -80
 - sglang/srt/entrypoints/grpc_server.py +430 -234
 - sglang/srt/entrypoints/harmony_utils.py +2 -2
 - sglang/srt/entrypoints/http_server.py +195 -102
 - sglang/srt/entrypoints/http_server_engine.py +1 -7
 - sglang/srt/entrypoints/openai/protocol.py +225 -37
 - sglang/srt/entrypoints/openai/serving_base.py +49 -2
 - sglang/srt/entrypoints/openai/serving_chat.py +29 -74
 - sglang/srt/entrypoints/openai/serving_classify.py +204 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +15 -1
 - sglang/srt/entrypoints/openai/serving_responses.py +5 -2
 - sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
 - sglang/srt/environ.py +58 -6
 - sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
 - sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
 - sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
 - sglang/srt/eplb/expert_distribution.py +33 -4
 - sglang/srt/eplb/expert_location_dispatch.py +2 -2
 - sglang/srt/eplb/expert_location_updater.py +2 -2
 - sglang/srt/function_call/base_format_detector.py +17 -18
 - sglang/srt/function_call/function_call_parser.py +20 -14
 - sglang/srt/function_call/glm4_moe_detector.py +1 -5
 - sglang/srt/function_call/gpt_oss_detector.py +1 -1
 - sglang/srt/function_call/json_array_parser.py +0 -2
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/function_call/utils.py +2 -2
 - sglang/srt/grpc/compile_proto.py +3 -3
 - sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
 - sglang/srt/grpc/health_servicer.py +189 -0
 - sglang/srt/grpc/scheduler_launcher.py +181 -0
 - sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
 - sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
 - sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
 - sglang/srt/layers/activation.py +10 -1
 - sglang/srt/layers/attention/aiter_backend.py +3 -3
 - sglang/srt/layers/attention/ascend_backend.py +17 -1
 - sglang/srt/layers/attention/attention_registry.py +43 -23
 - sglang/srt/layers/attention/base_attn_backend.py +20 -1
 - sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
 - sglang/srt/layers/attention/fla/chunk.py +0 -1
 - sglang/srt/layers/attention/fla/chunk_o.py +1 -1
 - sglang/srt/layers/attention/fla/index.py +0 -2
 - sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
 - sglang/srt/layers/attention/fla/utils.py +0 -3
 - sglang/srt/layers/attention/fla/wy_fast.py +0 -2
 - sglang/srt/layers/attention/flashattention_backend.py +24 -10
 - sglang/srt/layers/attention/flashinfer_backend.py +258 -22
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
 - sglang/srt/layers/attention/flashmla_backend.py +2 -2
 - sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
 - sglang/srt/layers/attention/intel_amx_backend.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
 - sglang/srt/layers/attention/mamba/mamba.py +189 -241
 - sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
 - sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
 - sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
 - sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
 - sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
 - sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
 - sglang/srt/layers/attention/nsa/utils.py +0 -1
 - sglang/srt/layers/attention/nsa_backend.py +404 -90
 - sglang/srt/layers/attention/triton_backend.py +208 -34
 - sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
 - sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
 - sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
 - sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
 - sglang/srt/layers/attention/utils.py +89 -7
 - sglang/srt/layers/attention/vision.py +3 -3
 - sglang/srt/layers/attention/xpu_backend.py +1028 -0
 - sglang/srt/layers/communicator.py +12 -7
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
 - sglang/srt/layers/dp_attention.py +17 -0
 - sglang/srt/layers/layernorm.py +64 -19
 - sglang/srt/layers/linear.py +9 -1
 - sglang/srt/layers/logits_processor.py +152 -17
 - sglang/srt/layers/modelopt_utils.py +11 -0
 - sglang/srt/layers/moe/cutlass_moe.py +0 -2
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
 - sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
 - sglang/srt/layers/moe/ep_moe/layer.py +154 -625
 - sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
 - sglang/srt/layers/moe/moe_runner/runner.py +6 -0
 - sglang/srt/layers/moe/moe_runner/triton.py +3 -1
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
 - sglang/srt/layers/moe/router.py +51 -15
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
 - sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
 - sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
 - sglang/srt/layers/moe/topk.py +7 -6
 - sglang/srt/layers/moe/utils.py +20 -5
 - sglang/srt/layers/quantization/__init__.py +5 -58
 - sglang/srt/layers/quantization/awq.py +183 -9
 - sglang/srt/layers/quantization/awq_triton.py +29 -0
 - sglang/srt/layers/quantization/base_config.py +27 -1
 - sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
 - sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
 - sglang/srt/layers/quantization/fp8.py +152 -81
 - sglang/srt/layers/quantization/fp8_kernel.py +55 -10
 - sglang/srt/layers/quantization/fp8_utils.py +42 -14
 - sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/gptq.py +0 -1
 - sglang/srt/layers/quantization/int8_kernel.py +18 -2
 - sglang/srt/layers/quantization/marlin_utils.py +12 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +125 -100
 - sglang/srt/layers/quantization/mxfp4.py +35 -68
 - sglang/srt/layers/quantization/petit.py +1 -1
 - sglang/srt/layers/quantization/quark/quark.py +3 -1
 - sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
 - sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
 - sglang/srt/layers/quantization/unquant.py +23 -48
 - sglang/srt/layers/quantization/utils.py +0 -1
 - sglang/srt/layers/quantization/w4afp8.py +87 -20
 - sglang/srt/layers/quantization/w8a8_int8.py +30 -24
 - sglang/srt/layers/radix_attention.py +62 -9
 - sglang/srt/layers/rotary_embedding.py +686 -17
 - sglang/srt/layers/sampler.py +47 -16
 - sglang/srt/layers/sparse_pooler.py +98 -0
 - sglang/srt/layers/utils.py +0 -1
 - sglang/srt/layers/vocab_parallel_embedding.py +4 -1
 - sglang/srt/lora/backend/triton_backend.py +0 -1
 - sglang/srt/lora/eviction_policy.py +139 -0
 - sglang/srt/lora/lora_manager.py +24 -9
 - sglang/srt/lora/lora_registry.py +1 -1
 - sglang/srt/lora/mem_pool.py +40 -16
 - sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
 - sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
 - sglang/srt/managers/cache_controller.py +48 -17
 - sglang/srt/managers/data_parallel_controller.py +146 -42
 - sglang/srt/managers/detokenizer_manager.py +40 -13
 - sglang/srt/managers/io_struct.py +69 -16
 - sglang/srt/managers/mm_utils.py +20 -18
 - sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
 - sglang/srt/managers/overlap_utils.py +96 -19
 - sglang/srt/managers/schedule_batch.py +241 -511
 - sglang/srt/managers/schedule_policy.py +15 -2
 - sglang/srt/managers/scheduler.py +420 -514
 - sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
 - sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
 - sglang/srt/managers/scheduler_pp_mixin.py +341 -0
 - sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
 - sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
 - sglang/srt/managers/tokenizer_manager.py +375 -95
 - sglang/srt/managers/tp_worker.py +212 -161
 - sglang/srt/managers/utils.py +78 -2
 - sglang/srt/mem_cache/allocator.py +7 -2
 - sglang/srt/mem_cache/allocator_ascend.py +2 -2
 - sglang/srt/mem_cache/base_prefix_cache.py +2 -2
 - sglang/srt/mem_cache/chunk_cache.py +13 -2
 - sglang/srt/mem_cache/common.py +480 -0
 - sglang/srt/mem_cache/evict_policy.py +16 -1
 - sglang/srt/mem_cache/hicache_storage.py +11 -2
 - sglang/srt/mem_cache/hiradix_cache.py +16 -3
 - sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
 - sglang/srt/mem_cache/memory_pool.py +517 -219
 - sglang/srt/mem_cache/memory_pool_host.py +0 -1
 - sglang/srt/mem_cache/multimodal_cache.py +0 -1
 - sglang/srt/mem_cache/radix_cache.py +53 -19
 - sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
 - sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
 - sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
 - sglang/srt/mem_cache/storage/backend_factory.py +2 -2
 - sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
 - sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
 - sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
 - sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
 - sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
 - sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
 - sglang/srt/mem_cache/swa_radix_cache.py +92 -26
 - sglang/srt/metrics/collector.py +31 -0
 - sglang/srt/metrics/func_timer.py +1 -1
 - sglang/srt/model_executor/cuda_graph_runner.py +43 -5
 - sglang/srt/model_executor/forward_batch_info.py +71 -25
 - sglang/srt/model_executor/model_runner.py +362 -270
 - sglang/srt/model_executor/npu_graph_runner.py +2 -3
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
 - sglang/srt/model_loader/__init__.py +1 -1
 - sglang/srt/model_loader/loader.py +424 -27
 - sglang/srt/model_loader/utils.py +0 -1
 - sglang/srt/model_loader/weight_utils.py +47 -28
 - sglang/srt/models/apertus.py +2 -3
 - sglang/srt/models/arcee.py +2 -2
 - sglang/srt/models/bailing_moe.py +13 -52
 - sglang/srt/models/bailing_moe_nextn.py +3 -4
 - sglang/srt/models/bert.py +1 -1
 - sglang/srt/models/deepseek_nextn.py +19 -3
 - sglang/srt/models/deepseek_ocr.py +1516 -0
 - sglang/srt/models/deepseek_v2.py +418 -140
 - sglang/srt/models/dots_ocr.py +0 -2
 - sglang/srt/models/dots_vlm.py +0 -1
 - sglang/srt/models/dots_vlm_vit.py +1 -1
 - sglang/srt/models/falcon_h1.py +13 -19
 - sglang/srt/models/gemma3_mm.py +16 -0
 - sglang/srt/models/gemma3n_mm.py +1 -2
 - sglang/srt/models/glm4_moe.py +327 -382
 - sglang/srt/models/glm4_moe_nextn.py +6 -16
 - sglang/srt/models/glm4v.py +2 -1
 - sglang/srt/models/glm4v_moe.py +32 -199
 - sglang/srt/models/gpt_oss.py +5 -5
 - sglang/srt/models/grok.py +10 -23
 - sglang/srt/models/hunyuan.py +2 -7
 - sglang/srt/models/interns1.py +0 -1
 - sglang/srt/models/kimi_vl.py +1 -7
 - sglang/srt/models/kimi_vl_moonvit.py +3 -1
 - sglang/srt/models/llama.py +2 -2
 - sglang/srt/models/llama_eagle3.py +1 -1
 - sglang/srt/models/longcat_flash.py +5 -22
 - sglang/srt/models/longcat_flash_nextn.py +3 -14
 - sglang/srt/models/mimo.py +2 -13
 - sglang/srt/models/mimo_mtp.py +1 -2
 - sglang/srt/models/minicpmo.py +7 -5
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/mixtral.py +1 -4
 - sglang/srt/models/mllama.py +1 -1
 - sglang/srt/models/mllama4.py +13 -3
 - sglang/srt/models/nemotron_h.py +511 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/olmo2.py +31 -4
 - sglang/srt/models/opt.py +5 -5
 - sglang/srt/models/phi.py +1 -1
 - sglang/srt/models/phi4mm.py +1 -1
 - sglang/srt/models/phimoe.py +0 -1
 - sglang/srt/models/pixtral.py +0 -3
 - sglang/srt/models/points_v15_chat.py +186 -0
 - sglang/srt/models/qwen.py +0 -1
 - sglang/srt/models/qwen2.py +22 -1
 - sglang/srt/models/qwen2_5_vl.py +3 -3
 - sglang/srt/models/qwen2_audio.py +2 -15
 - sglang/srt/models/qwen2_moe.py +15 -12
 - sglang/srt/models/qwen2_vl.py +5 -2
 - sglang/srt/models/qwen3.py +34 -4
 - sglang/srt/models/qwen3_moe.py +19 -37
 - sglang/srt/models/qwen3_next.py +7 -12
 - sglang/srt/models/qwen3_next_mtp.py +3 -4
 - sglang/srt/models/qwen3_omni_moe.py +661 -0
 - sglang/srt/models/qwen3_vl.py +37 -33
 - sglang/srt/models/qwen3_vl_moe.py +57 -185
 - sglang/srt/models/roberta.py +55 -3
 - sglang/srt/models/sarashina2_vision.py +0 -1
 - sglang/srt/models/step3_vl.py +3 -5
 - sglang/srt/models/utils.py +11 -1
 - sglang/srt/multimodal/processors/base_processor.py +7 -2
 - sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
 - sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
 - sglang/srt/multimodal/processors/dots_vlm.py +0 -1
 - sglang/srt/multimodal/processors/glm4v.py +2 -6
 - sglang/srt/multimodal/processors/internvl.py +0 -2
 - sglang/srt/multimodal/processors/janus_pro.py +0 -1
 - sglang/srt/multimodal/processors/mllama4.py +0 -8
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/phi4mm.py +0 -1
 - sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
 - sglang/srt/multimodal/processors/qwen_vl.py +75 -16
 - sglang/srt/multimodal/processors/step3_vl.py +1 -1
 - sglang/srt/parser/conversation.py +41 -0
 - sglang/srt/parser/reasoning_parser.py +28 -2
 - sglang/srt/sampling/custom_logit_processor.py +77 -2
 - sglang/srt/sampling/sampling_batch_info.py +17 -22
 - sglang/srt/sampling/sampling_params.py +70 -2
 - sglang/srt/server_args.py +846 -163
 - sglang/srt/server_args_config_parser.py +1 -1
 - sglang/srt/single_batch_overlap.py +36 -31
 - sglang/srt/speculative/base_spec_worker.py +34 -0
 - sglang/srt/speculative/draft_utils.py +226 -0
 - sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
 - sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
 - sglang/srt/speculative/eagle_info.py +57 -18
 - sglang/srt/speculative/eagle_info_v2.py +458 -0
 - sglang/srt/speculative/eagle_utils.py +138 -0
 - sglang/srt/speculative/eagle_worker.py +83 -280
 - sglang/srt/speculative/eagle_worker_v2.py +702 -0
 - sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
 - sglang/srt/speculative/ngram_worker.py +12 -11
 - sglang/srt/speculative/spec_info.py +2 -0
 - sglang/srt/speculative/spec_utils.py +38 -3
 - sglang/srt/speculative/standalone_worker.py +4 -14
 - sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
 - sglang/srt/two_batch_overlap.py +28 -14
 - sglang/srt/utils/__init__.py +1 -1
 - sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
 - sglang/srt/utils/common.py +272 -82
 - sglang/srt/utils/hf_transformers_utils.py +44 -17
 - sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
 - sglang/srt/{offloader.py → utils/offloader.py} +4 -4
 - sglang/srt/utils/profile_merger.py +199 -0
 - sglang/test/attention/test_flashattn_backend.py +1 -1
 - sglang/test/attention/test_flashattn_mla_backend.py +0 -1
 - sglang/test/attention/test_prefix_chunk_info.py +0 -2
 - sglang/test/attention/test_trtllm_mla_backend.py +221 -53
 - sglang/test/few_shot_gsm8k_engine.py +2 -4
 - sglang/test/kit_matched_stop.py +157 -0
 - sglang/test/longbench_v2/__init__.py +1 -0
 - sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
 - sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
 - sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
 - sglang/test/run_eval.py +41 -0
 - sglang/test/runners.py +2 -0
 - sglang/test/send_one.py +42 -7
 - sglang/test/simple_eval_common.py +3 -0
 - sglang/test/simple_eval_gpqa.py +0 -1
 - sglang/test/simple_eval_humaneval.py +0 -3
 - sglang/test/simple_eval_longbench_v2.py +344 -0
 - sglang/test/test_block_fp8.py +1 -2
 - sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
 - sglang/test/test_cutlass_moe.py +1 -2
 - sglang/test/test_cutlass_w4a8_moe.py +10 -20
 - sglang/test/test_deterministic.py +463 -107
 - sglang/test/test_deterministic_utils.py +74 -0
 - sglang/test/test_disaggregation_utils.py +81 -0
 - sglang/test/test_marlin_moe.py +0 -1
 - sglang/test/test_utils.py +85 -20
 - sglang/version.py +1 -1
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
 - sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
 - sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
 - sglang/srt/models/vila.py +0 -306
 - sglang/srt/speculative/build_eagle_tree.py +0 -427
 - sglang/test/test_block_fp8_ep.py +0 -358
 - /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
 - /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
 - /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
 
| 
         @@ -1,8 +1,8 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            import json
         
     | 
| 
       2 
1 
     | 
    
         
             
            from json import JSONDecodeError, JSONDecoder
         
     | 
| 
       3 
2 
     | 
    
         
             
            from json.decoder import WHITESPACE
         
     | 
| 
       4 
3 
     | 
    
         
             
            from typing import Any, List, Literal, Optional, Tuple, Union
         
     | 
| 
       5 
4 
     | 
    
         | 
| 
      
 5 
     | 
    
         
            +
            import orjson
         
     | 
| 
       6 
6 
     | 
    
         
             
            import partial_json_parser
         
     | 
| 
       7 
7 
     | 
    
         
             
            from partial_json_parser.core.options import Allow
         
     | 
| 
       8 
8 
     | 
    
         | 
| 
         @@ -51,7 +51,7 @@ def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]: 
     | 
|
| 
       51 
51 
     | 
    
         | 
| 
       52 
52 
     | 
    
         
             
            def _is_complete_json(input_str: str) -> bool:
         
     | 
| 
       53 
53 
     | 
    
         
             
                try:
         
     | 
| 
       54 
     | 
    
         
            -
                     
     | 
| 
      
 54 
     | 
    
         
            +
                    orjson.loads(input_str)
         
     | 
| 
       55 
55 
     | 
    
         
             
                    return True
         
     | 
| 
       56 
56 
     | 
    
         
             
                except JSONDecodeError:
         
     | 
| 
       57 
57 
     | 
    
         
             
                    return False
         
     | 
    
        sglang/srt/grpc/compile_proto.py
    CHANGED
    
    | 
         @@ -16,7 +16,7 @@ Options: 
     | 
|
| 
       16 
16 
     | 
    
         
             
                --proto-file    Specify proto file (default: sglang_scheduler.proto)
         
     | 
| 
       17 
17 
     | 
    
         | 
| 
       18 
18 
     | 
    
         
             
            ### Install Dependencies
         
     | 
| 
       19 
     | 
    
         
            -
            pip install "grpcio==1. 
     | 
| 
      
 19 
     | 
    
         
            +
            pip install "grpcio==1.75.1" "grpcio-tools==1.75.1"
         
     | 
| 
       20 
20 
     | 
    
         | 
| 
       21 
21 
     | 
    
         
             
            ### Run Script
         
     | 
| 
       22 
22 
     | 
    
         
             
            cd python/sglang/srt/grpc
         
     | 
| 
         @@ -30,7 +30,7 @@ import sys 
     | 
|
| 
       30 
30 
     | 
    
         
             
            from importlib.metadata import version
         
     | 
| 
       31 
31 
     | 
    
         
             
            from pathlib import Path
         
     | 
| 
       32 
32 
     | 
    
         | 
| 
       33 
     | 
    
         
            -
            GRPC_VERSION = "1. 
     | 
| 
      
 33 
     | 
    
         
            +
            GRPC_VERSION = "1.75.1"
         
     | 
| 
       34 
34 
     | 
    
         | 
| 
       35 
35 
     | 
    
         | 
| 
       36 
36 
     | 
    
         
             
            def get_file_mtime(path: Path) -> float:
         
     | 
| 
         @@ -70,7 +70,7 @@ def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> b 
     | 
|
| 
       70 
70 
     | 
    
         | 
| 
       71 
71 
     | 
    
         
             
                # Check if grpc_tools is available
         
     | 
| 
       72 
72 
     | 
    
         
             
                try:
         
     | 
| 
       73 
     | 
    
         
            -
                    import grpc_tools.protoc
         
     | 
| 
      
 73 
     | 
    
         
            +
                    import grpc_tools.protoc  # noqa: F401
         
     | 
| 
       74 
74 
     | 
    
         
             
                except ImportError:
         
     | 
| 
       75 
75 
     | 
    
         
             
                    print("Error: grpcio-tools not installed")
         
     | 
| 
       76 
76 
     | 
    
         
             
                    print(
         
     | 
| 
         @@ -263,8 +263,8 @@ class GrpcRequestManager: 
     | 
|
| 
       263 
263 
     | 
    
         
             
                                    response = await task
         
     | 
| 
       264 
264 
     | 
    
         | 
| 
       265 
265 
     | 
    
         
             
                                    # Add index for client-side ordering
         
     | 
| 
       266 
     | 
    
         
            -
                                    if isinstance(response, dict) 
     | 
| 
       267 
     | 
    
         
            -
                                        response_rid = response 
     | 
| 
      
 266 
     | 
    
         
            +
                                    if isinstance(response, dict):
         
     | 
| 
      
 267 
     | 
    
         
            +
                                        response_rid = response.get("request_id", "")
         
     | 
| 
       268 
268 
     | 
    
         
             
                                        if response_rid in rid_to_index:
         
     | 
| 
       269 
269 
     | 
    
         
             
                                            response["index"] = rid_to_index[response_rid]
         
     | 
| 
       270 
270 
     | 
    
         | 
| 
         @@ -318,13 +318,8 @@ class GrpcRequestManager: 
     | 
|
| 
       318 
318 
     | 
    
         
             
                        is_stream = getattr(obj, "stream", False)
         
     | 
| 
       319 
319 
     | 
    
         | 
| 
       320 
320 
     | 
    
         
             
                        while True:
         
     | 
| 
       321 
     | 
    
         
            -
                            # Client cancelled - notify scheduler and exit
         
     | 
| 
       322 
     | 
    
         
            -
                            if grpc_context and grpc_context.cancelled():
         
     | 
| 
       323 
     | 
    
         
            -
                                await self.abort_request(request_id)
         
     | 
| 
       324 
     | 
    
         
            -
                                return
         
     | 
| 
       325 
     | 
    
         
            -
             
     | 
| 
       326 
321 
     | 
    
         
             
                            try:
         
     | 
| 
       327 
     | 
    
         
            -
                                response = await  
     | 
| 
      
 322 
     | 
    
         
            +
                                response = await state.out_queue.get()
         
     | 
| 
       328 
323 
     | 
    
         | 
| 
       329 
324 
     | 
    
         
             
                                if is_stream:
         
     | 
| 
       330 
325 
     | 
    
         
             
                                    yield response
         
     | 
| 
         @@ -337,13 +332,11 @@ class GrpcRequestManager: 
     | 
|
| 
       337 
332 
     | 
    
         
             
                                        yield final_response
         
     | 
| 
       338 
333 
     | 
    
         
             
                                    break
         
     | 
| 
       339 
334 
     | 
    
         | 
| 
       340 
     | 
    
         
            -
                            except asyncio. 
     | 
| 
       341 
     | 
    
         
            -
                                #  
     | 
| 
       342 
     | 
    
         
            -
                                logger. 
     | 
| 
       343 
     | 
    
         
            -
                                    f"Timeout waiting for response for request {request_id}"
         
     | 
| 
       344 
     | 
    
         
            -
                                )
         
     | 
| 
      
 335 
     | 
    
         
            +
                            except asyncio.CancelledError:
         
     | 
| 
      
 336 
     | 
    
         
            +
                                # Task was cancelled by gRPC framework when client disconnected
         
     | 
| 
      
 337 
     | 
    
         
            +
                                logger.info(f"Request {request_id} cancelled by client")
         
     | 
| 
       345 
338 
     | 
    
         
             
                                await self.abort_request(request_id)
         
     | 
| 
       346 
     | 
    
         
            -
                                 
     | 
| 
      
 339 
     | 
    
         
            +
                                raise  # Re-raise to let gRPC server handle cleanup
         
     | 
| 
       347 
340 
     | 
    
         | 
| 
       348 
341 
     | 
    
         
             
                    finally:
         
     | 
| 
       349 
342 
     | 
    
         
             
                        # Always clean up request state when exiting
         
     | 
| 
         @@ -397,9 +390,7 @@ class GrpcRequestManager: 
     | 
|
| 
       397 
390 
     | 
    
         
             
                    # Wait for result in background
         
     | 
| 
       398 
391 
     | 
    
         
             
                    async def wait_for_result():
         
     | 
| 
       399 
392 
     | 
    
         
             
                        try:
         
     | 
| 
       400 
     | 
    
         
            -
                            # Wait for completion
         
     | 
| 
       401 
393 
     | 
    
         
             
                            await state.event.wait()
         
     | 
| 
       402 
     | 
    
         
            -
                            # Get result from queue
         
     | 
| 
       403 
394 
     | 
    
         
             
                            result = await state.out_queue.get()
         
     | 
| 
       404 
395 
     | 
    
         
             
                            future.set_result(result)
         
     | 
| 
       405 
396 
     | 
    
         
             
                        except Exception as e:
         
     | 
| 
         @@ -413,43 +404,34 @@ class GrpcRequestManager: 
     | 
|
| 
       413 
404 
     | 
    
         
             
                    return future
         
     | 
| 
       414 
405 
     | 
    
         | 
| 
       415 
406 
     | 
    
         
             
                async def abort_request(self, request_id: str) -> bool:
         
     | 
| 
       416 
     | 
    
         
            -
                    """Abort a running request. 
     | 
| 
       417 
     | 
    
         
            -
                    if request_id not in self.rid_to_state:
         
     | 
| 
       418 
     | 
    
         
            -
                        return False
         
     | 
| 
      
 407 
     | 
    
         
            +
                    """Abort a running request.
         
     | 
| 
       419 
408 
     | 
    
         | 
| 
       420 
     | 
    
         
            -
                     
     | 
| 
       421 
     | 
    
         
            -
                     
     | 
| 
       422 
     | 
    
         
            -
                     
     | 
| 
       423 
     | 
    
         
            -
             
     | 
| 
       424 
     | 
    
         
            -
                     
     | 
| 
       425 
     | 
    
         
            -
                        logger.error(f"Failed to send abort request: {e}")
         
     | 
| 
      
 409 
     | 
    
         
            +
                    Sends abort request to scheduler and marks local state as finished
         
     | 
| 
      
 410 
     | 
    
         
            +
                    to stop processing any further outputs from the scheduler.
         
     | 
| 
      
 411 
     | 
    
         
            +
                    """
         
     | 
| 
      
 412 
     | 
    
         
            +
                    # Skip aborting health check requests (they clean themselves up)
         
     | 
| 
      
 413 
     | 
    
         
            +
                    if request_id.startswith("HEALTH_CHECK"):
         
     | 
| 
       426 
414 
     | 
    
         
             
                        return False
         
     | 
| 
       427 
415 
     | 
    
         | 
| 
       428 
     | 
    
         
            -
                    # Mark as finished
         
     | 
| 
      
 416 
     | 
    
         
            +
                    # Mark state as finished immediately to stop processing scheduler outputs
         
     | 
| 
       429 
417 
     | 
    
         
             
                    state = self.rid_to_state.get(request_id)
         
     | 
| 
       430 
418 
     | 
    
         
             
                    if state:
         
     | 
| 
       431 
419 
     | 
    
         
             
                        state.finished = True
         
     | 
| 
       432 
420 
     | 
    
         
             
                        state.stream_finished = True
         
     | 
| 
       433 
     | 
    
         
            -
                         
     | 
| 
      
 421 
     | 
    
         
            +
                        logger.debug(f"Marked request {request_id} as aborted locally")
         
     | 
| 
       434 
422 
     | 
    
         | 
| 
       435 
     | 
    
         
            -
             
     | 
| 
       436 
     | 
    
         
            -
             
     | 
| 
      
 423 
     | 
    
         
            +
                    # Send abort to scheduler - the scheduler will send AbortReq back
         
     | 
| 
      
 424 
     | 
    
         
            +
                    # which will be handled by _handle_abort_req
         
     | 
| 
      
 425 
     | 
    
         
            +
                    abort_req = AbortReq(rid=request_id)
         
     | 
| 
      
 426 
     | 
    
         
            +
                    try:
         
     | 
| 
      
 427 
     | 
    
         
            +
                        await self._send_to_scheduler(abort_req)
         
     | 
| 
      
 428 
     | 
    
         
            +
                        logger.debug(f"Sent abort to scheduler for request {request_id}")
         
     | 
| 
      
 429 
     | 
    
         
            +
                    except Exception as e:
         
     | 
| 
      
 430 
     | 
    
         
            +
                        logger.error(f"Failed to send abort request to scheduler: {e}")
         
     | 
| 
      
 431 
     | 
    
         
            +
                        return False
         
     | 
| 
       437 
432 
     | 
    
         | 
| 
       438 
433 
     | 
    
         
             
                    return True
         
     | 
| 
       439 
434 
     | 
    
         | 
| 
       440 
     | 
    
         
            -
                async def pause_generation(self):
         
     | 
| 
       441 
     | 
    
         
            -
                    """Pause generation processing."""
         
     | 
| 
       442 
     | 
    
         
            -
                    async with self.is_pause_cond:
         
     | 
| 
       443 
     | 
    
         
            -
                        self.is_pause = True
         
     | 
| 
       444 
     | 
    
         
            -
                        logger.info("Generation paused")
         
     | 
| 
       445 
     | 
    
         
            -
             
     | 
| 
       446 
     | 
    
         
            -
                async def resume_generation(self):
         
     | 
| 
       447 
     | 
    
         
            -
                    """Resume generation processing."""
         
     | 
| 
       448 
     | 
    
         
            -
                    async with self.is_pause_cond:
         
     | 
| 
       449 
     | 
    
         
            -
                        self.is_pause = False
         
     | 
| 
       450 
     | 
    
         
            -
                        self.is_pause_cond.notify_all()
         
     | 
| 
       451 
     | 
    
         
            -
                        logger.info("Generation resumed")
         
     | 
| 
       452 
     | 
    
         
            -
             
     | 
| 
       453 
435 
     | 
    
         
             
                async def handle_loop(self):
         
     | 
| 
       454 
436 
     | 
    
         
             
                    """
         
     | 
| 
       455 
437 
     | 
    
         
             
                    Main event loop - processes outputs from scheduler.
         
     | 
| 
         @@ -461,10 +443,11 @@ class GrpcRequestManager: 
     | 
|
| 
       461 
443 
     | 
    
         
             
                            recv_obj = await self.recv_from_scheduler.recv_pyobj()
         
     | 
| 
       462 
444 
     | 
    
         
             
                            self.last_receive_tstamp = time.time()
         
     | 
| 
       463 
445 
     | 
    
         | 
| 
       464 
     | 
    
         
            -
                            # Check for pause
         
     | 
| 
       465 
     | 
    
         
            -
                             
     | 
| 
       466 
     | 
    
         
            -
                                 
     | 
| 
       467 
     | 
    
         
            -
                                     
     | 
| 
      
 446 
     | 
    
         
            +
                            # Check for pause (optimized: check flag before acquiring lock)
         
     | 
| 
      
 447 
     | 
    
         
            +
                            if self.is_pause:
         
     | 
| 
      
 448 
     | 
    
         
            +
                                async with self.is_pause_cond:
         
     | 
| 
      
 449 
     | 
    
         
            +
                                    while self.is_pause:
         
     | 
| 
      
 450 
     | 
    
         
            +
                                        await self.is_pause_cond.wait()
         
     | 
| 
       468 
451 
     | 
    
         | 
| 
       469 
452 
     | 
    
         
             
                            # Handle different output types
         
     | 
| 
       470 
453 
     | 
    
         
             
                            if isinstance(recv_obj, BatchTokenIDOutput):
         
     | 
| 
         @@ -473,6 +456,8 @@ class GrpcRequestManager: 
     | 
|
| 
       473 
456 
     | 
    
         
             
                                await self._handle_embedding_output(recv_obj)
         
     | 
| 
       474 
457 
     | 
    
         
             
                            elif isinstance(recv_obj, HealthCheckOutput):
         
     | 
| 
       475 
458 
     | 
    
         
             
                                await self._handle_health_check_output(recv_obj)
         
     | 
| 
      
 459 
     | 
    
         
            +
                            elif isinstance(recv_obj, AbortReq):
         
     | 
| 
      
 460 
     | 
    
         
            +
                                await self._handle_abort_req(recv_obj)
         
     | 
| 
       476 
461 
     | 
    
         
             
                            else:
         
     | 
| 
       477 
462 
     | 
    
         
             
                                logger.warning(f"Unknown output type: {type(recv_obj)}")
         
     | 
| 
       478 
463 
     | 
    
         | 
| 
         @@ -547,6 +532,11 @@ class GrpcRequestManager: 
     | 
|
| 
       547 
532 
     | 
    
         | 
| 
       548 
533 
     | 
    
         
             
                async def _handle_batch_output(self, batch_out: BatchTokenIDOutput):
         
     | 
| 
       549 
534 
     | 
    
         
             
                    """Handle batch generation output from scheduler."""
         
     | 
| 
      
 535 
     | 
    
         
            +
                    # Collect all queue.put() tasks for parallel execution
         
     | 
| 
      
 536 
     | 
    
         
            +
                    put_tasks = []
         
     | 
| 
      
 537 
     | 
    
         
            +
                    cleanup_tasks = []
         
     | 
| 
      
 538 
     | 
    
         
            +
                    now = time.time()
         
     | 
| 
      
 539 
     | 
    
         
            +
             
     | 
| 
       550 
540 
     | 
    
         
             
                    # Process each request in the batch
         
     | 
| 
       551 
541 
     | 
    
         
             
                    for i, rid in enumerate(batch_out.rids):
         
     | 
| 
       552 
542 
     | 
    
         
             
                        if rid not in self.rid_to_state:
         
     | 
| 
         @@ -554,8 +544,12 @@ class GrpcRequestManager: 
     | 
|
| 
       554 
544 
     | 
    
         | 
| 
       555 
545 
     | 
    
         
             
                        state = self.rid_to_state[rid]
         
     | 
| 
       556 
546 
     | 
    
         | 
| 
      
 547 
     | 
    
         
            +
                        # Skip if already aborted/finished locally (client cancelled)
         
     | 
| 
      
 548 
     | 
    
         
            +
                        if state.finished:
         
     | 
| 
      
 549 
     | 
    
         
            +
                            logger.debug(f"Skipping output for aborted request {rid}")
         
     | 
| 
      
 550 
     | 
    
         
            +
                            continue
         
     | 
| 
      
 551 
     | 
    
         
            +
             
     | 
| 
       557 
552 
     | 
    
         
             
                        # Update metrics
         
     | 
| 
       558 
     | 
    
         
            -
                        now = time.time()
         
     | 
| 
       559 
553 
     | 
    
         
             
                        if state.first_token_time == 0.0:
         
     | 
| 
       560 
554 
     | 
    
         
             
                            state.first_token_time = now
         
     | 
| 
       561 
555 
     | 
    
         
             
                        state.last_time = now
         
     | 
| 
         @@ -649,7 +643,8 @@ class GrpcRequestManager: 
     | 
|
| 
       649 
643 
     | 
    
         
             
                        if output_data["token_ids"]:
         
     | 
| 
       650 
644 
     | 
    
         
             
                            state.output_ids.extend(output_data["token_ids"])
         
     | 
| 
       651 
645 
     | 
    
         | 
| 
       652 
     | 
    
         
            -
                         
     | 
| 
      
 646 
     | 
    
         
            +
                        # Add queue.put() to parallel task list
         
     | 
| 
      
 647 
     | 
    
         
            +
                        put_tasks.append(state.out_queue.put(output_data))
         
     | 
| 
       653 
648 
     | 
    
         | 
| 
       654 
649 
     | 
    
         
             
                        # Handle completion
         
     | 
| 
       655 
650 
     | 
    
         
             
                        if output_data["finished"]:
         
     | 
| 
         @@ -659,12 +654,16 @@ class GrpcRequestManager: 
     | 
|
| 
       659 
654 
     | 
    
         
             
                            state.event.set()
         
     | 
| 
       660 
655 
     | 
    
         | 
| 
       661 
656 
     | 
    
         
             
                            # Remove from tracking after a delay
         
     | 
| 
       662 
     | 
    
         
            -
                            async def cleanup():
         
     | 
| 
      
 657 
     | 
    
         
            +
                            async def cleanup(request_id):
         
     | 
| 
       663 
658 
     | 
    
         
             
                                await asyncio.sleep(5.0)
         
     | 
| 
       664 
     | 
    
         
            -
                                if  
     | 
| 
       665 
     | 
    
         
            -
                                    del self.rid_to_state[ 
     | 
| 
      
 659 
     | 
    
         
            +
                                if request_id in self.rid_to_state:
         
     | 
| 
      
 660 
     | 
    
         
            +
                                    del self.rid_to_state[request_id]
         
     | 
| 
       666 
661 
     | 
    
         | 
| 
       667 
     | 
    
         
            -
                            asyncio.create_task(cleanup())
         
     | 
| 
      
 662 
     | 
    
         
            +
                            cleanup_tasks.append(asyncio.create_task(cleanup(rid)))
         
     | 
| 
      
 663 
     | 
    
         
            +
             
     | 
| 
      
 664 
     | 
    
         
            +
                    # Execute all queue.put() operations in parallel
         
     | 
| 
      
 665 
     | 
    
         
            +
                    if put_tasks:
         
     | 
| 
      
 666 
     | 
    
         
            +
                        await asyncio.gather(*put_tasks, return_exceptions=True)
         
     | 
| 
       668 
667 
     | 
    
         | 
| 
       669 
668 
     | 
    
         
             
                async def _handle_embedding_output(self, batch_out: BatchEmbeddingOutput):
         
     | 
| 
       670 
669 
     | 
    
         
             
                    """Handle batch embedding output from scheduler."""
         
     | 
| 
         @@ -726,6 +725,67 @@ class GrpcRequestManager: 
     | 
|
| 
       726 
725 
     | 
    
         
             
                    state.finished_time = time.time()
         
     | 
| 
       727 
726 
     | 
    
         
             
                    state.event.set()
         
     | 
| 
       728 
727 
     | 
    
         | 
| 
      
 728 
     | 
    
         
            +
                async def _handle_abort_req(self, recv_obj: AbortReq):
         
     | 
| 
      
 729 
     | 
    
         
            +
                    """Handle abort request from scheduler.
         
     | 
| 
      
 730 
     | 
    
         
            +
             
     | 
| 
      
 731 
     | 
    
         
            +
                    The scheduler sends AbortReq back to notify us that a request was aborted,
         
     | 
| 
      
 732 
     | 
    
         
            +
                    either due to explicit abort_request() call or scheduler-initiated abort
         
     | 
| 
      
 733 
     | 
    
         
            +
                    (priority preemption, queue full, KV cache pressure, etc).
         
     | 
| 
      
 734 
     | 
    
         
            +
                    """
         
     | 
| 
      
 735 
     | 
    
         
            +
                    # Skip health check requests
         
     | 
| 
      
 736 
     | 
    
         
            +
                    if recv_obj.rid.startswith("HEALTH_CHECK"):
         
     | 
| 
      
 737 
     | 
    
         
            +
                        return
         
     | 
| 
      
 738 
     | 
    
         
            +
             
     | 
| 
      
 739 
     | 
    
         
            +
                    # Check if request still exists
         
     | 
| 
      
 740 
     | 
    
         
            +
                    if recv_obj.rid not in self.rid_to_state:
         
     | 
| 
      
 741 
     | 
    
         
            +
                        logger.debug(
         
     | 
| 
      
 742 
     | 
    
         
            +
                            f"Abort request for {recv_obj.rid} not in local state (may have already finished or not started yet)"
         
     | 
| 
      
 743 
     | 
    
         
            +
                        )
         
     | 
| 
      
 744 
     | 
    
         
            +
                        return
         
     | 
| 
      
 745 
     | 
    
         
            +
             
     | 
| 
      
 746 
     | 
    
         
            +
                    state = self.rid_to_state[recv_obj.rid]
         
     | 
| 
      
 747 
     | 
    
         
            +
             
     | 
| 
      
 748 
     | 
    
         
            +
                    # Mark as finished
         
     | 
| 
      
 749 
     | 
    
         
            +
                    state.finished = True
         
     | 
| 
      
 750 
     | 
    
         
            +
                    state.stream_finished = True
         
     | 
| 
      
 751 
     | 
    
         
            +
             
     | 
| 
      
 752 
     | 
    
         
            +
                    # Create abort response
         
     | 
| 
      
 753 
     | 
    
         
            +
                    if recv_obj.finished_reason:
         
     | 
| 
      
 754 
     | 
    
         
            +
                        # Scheduler provided a specific finish reason (e.g., priority preemption, queue full)
         
     | 
| 
      
 755 
     | 
    
         
            +
                        abort_response = {
         
     | 
| 
      
 756 
     | 
    
         
            +
                            "request_id": recv_obj.rid,
         
     | 
| 
      
 757 
     | 
    
         
            +
                            "error": recv_obj.finished_reason.get("message", "Request aborted"),
         
     | 
| 
      
 758 
     | 
    
         
            +
                            "finished": True,
         
     | 
| 
      
 759 
     | 
    
         
            +
                            "meta_info": {
         
     | 
| 
      
 760 
     | 
    
         
            +
                                "id": recv_obj.rid,
         
     | 
| 
      
 761 
     | 
    
         
            +
                                "finish_reason": recv_obj.finished_reason,
         
     | 
| 
      
 762 
     | 
    
         
            +
                            },
         
     | 
| 
      
 763 
     | 
    
         
            +
                        }
         
     | 
| 
      
 764 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 765 
     | 
    
         
            +
                        # Generic abort (e.g., explicit abort_request call)
         
     | 
| 
      
 766 
     | 
    
         
            +
                        abort_response = {
         
     | 
| 
      
 767 
     | 
    
         
            +
                            "request_id": recv_obj.rid,
         
     | 
| 
      
 768 
     | 
    
         
            +
                            "error": "Request aborted",
         
     | 
| 
      
 769 
     | 
    
         
            +
                            "finished": True,
         
     | 
| 
      
 770 
     | 
    
         
            +
                            "meta_info": {
         
     | 
| 
      
 771 
     | 
    
         
            +
                                "id": recv_obj.rid,
         
     | 
| 
      
 772 
     | 
    
         
            +
                                "finish_reason": {
         
     | 
| 
      
 773 
     | 
    
         
            +
                                    "type": "abort",
         
     | 
| 
      
 774 
     | 
    
         
            +
                                    "message": "Abort before prefill",
         
     | 
| 
      
 775 
     | 
    
         
            +
                                },
         
     | 
| 
      
 776 
     | 
    
         
            +
                                "prompt_tokens": 0,
         
     | 
| 
      
 777 
     | 
    
         
            +
                                "completion_tokens": 0,
         
     | 
| 
      
 778 
     | 
    
         
            +
                            },
         
     | 
| 
      
 779 
     | 
    
         
            +
                        }
         
     | 
| 
      
 780 
     | 
    
         
            +
             
     | 
| 
      
 781 
     | 
    
         
            +
                    # Send abort notification to output queue
         
     | 
| 
      
 782 
     | 
    
         
            +
                    await state.out_queue.put(abort_response)
         
     | 
| 
      
 783 
     | 
    
         
            +
             
     | 
| 
      
 784 
     | 
    
         
            +
                    # Wake up any waiting coroutines
         
     | 
| 
      
 785 
     | 
    
         
            +
                    state.event.set()
         
     | 
| 
      
 786 
     | 
    
         
            +
             
     | 
| 
      
 787 
     | 
    
         
            +
                    logger.debug(f"Handled abort request for {recv_obj.rid}")
         
     | 
| 
      
 788 
     | 
    
         
            +
             
     | 
| 
       729 
789 
     | 
    
         
             
                async def _send_to_scheduler(self, obj):
         
     | 
| 
       730 
790 
     | 
    
         
             
                    """Send an object to the scheduler via ZMQ."""
         
     | 
| 
       731 
791 
     | 
    
         
             
                    try:
         
     | 
| 
         @@ -0,0 +1,189 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            """
         
     | 
| 
      
 2 
     | 
    
         
            +
            Standard gRPC health check service implementation for Kubernetes probes.
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            This module implements the grpc.health.v1.Health service protocol, enabling
         
     | 
| 
      
 5 
     | 
    
         
            +
            native Kubernetes gRPC health probes for liveness and readiness checks.
         
     | 
| 
      
 6 
     | 
    
         
            +
            """
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            import logging
         
     | 
| 
      
 9 
     | 
    
         
            +
            import time
         
     | 
| 
      
 10 
     | 
    
         
            +
            from typing import AsyncIterator
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            import grpc
         
     | 
| 
      
 13 
     | 
    
         
            +
            from grpc_health.v1 import health_pb2, health_pb2_grpc
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            logger = logging.getLogger(__name__)
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
            class SGLangHealthServicer(health_pb2_grpc.HealthServicer):
         
     | 
| 
      
 19 
     | 
    
         
            +
                """
         
     | 
| 
      
 20 
     | 
    
         
            +
                Standard gRPC health check service implementation for Kubernetes probes.
         
     | 
| 
      
 21 
     | 
    
         
            +
                Implements grpc.health.v1.Health protocol.
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
                Supports two service levels:
         
     | 
| 
      
 24 
     | 
    
         
            +
                1. Overall server health (service="") - for liveness probes
         
     | 
| 
      
 25 
     | 
    
         
            +
                2. SGLang service health (service="sglang.grpc.scheduler.SglangScheduler") - for readiness probes
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
                Health status lifecycle:
         
     | 
| 
      
 28 
     | 
    
         
            +
                - NOT_SERVING: Initial state, model loading, or shutting down
         
     | 
| 
      
 29 
     | 
    
         
            +
                - SERVING: Model loaded and ready to serve requests
         
     | 
| 
      
 30 
     | 
    
         
            +
                """
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
                # Service names we support
         
     | 
| 
      
 33 
     | 
    
         
            +
                OVERALL_SERVER = ""  # Empty string for overall server health
         
     | 
| 
      
 34 
     | 
    
         
            +
                SGLANG_SERVICE = "sglang.grpc.scheduler.SglangScheduler"
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
                def __init__(self, request_manager, scheduler_info: dict):
         
     | 
| 
      
 37 
     | 
    
         
            +
                    """
         
     | 
| 
      
 38 
     | 
    
         
            +
                    Initialize health servicer.
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 41 
     | 
    
         
            +
                        request_manager: GrpcRequestManager instance for checking server state
         
     | 
| 
      
 42 
     | 
    
         
            +
                        scheduler_info: Dict containing scheduler metadata
         
     | 
| 
      
 43 
     | 
    
         
            +
                    """
         
     | 
| 
      
 44 
     | 
    
         
            +
                    self.request_manager = request_manager
         
     | 
| 
      
 45 
     | 
    
         
            +
                    self.scheduler_info = scheduler_info
         
     | 
| 
      
 46 
     | 
    
         
            +
                    self._serving_status = {}
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
                    # Initially set to NOT_SERVING until model is loaded
         
     | 
| 
      
 49 
     | 
    
         
            +
                    self._serving_status[self.OVERALL_SERVER] = (
         
     | 
| 
      
 50 
     | 
    
         
            +
                        health_pb2.HealthCheckResponse.NOT_SERVING
         
     | 
| 
      
 51 
     | 
    
         
            +
                    )
         
     | 
| 
      
 52 
     | 
    
         
            +
                    self._serving_status[self.SGLANG_SERVICE] = (
         
     | 
| 
      
 53 
     | 
    
         
            +
                        health_pb2.HealthCheckResponse.NOT_SERVING
         
     | 
| 
      
 54 
     | 
    
         
            +
                    )
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
                    logger.info("Standard gRPC health service initialized")
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
                def set_serving(self):
                    """Flip every tracked health entry to SERVING.

                    Meant to be invoked once the model has finished loading. Both the
                    overall-server entry and the SGLang service entry in
                    ``self._serving_status`` are overwritten, then the change is logged.
                    """
                    serving = health_pb2.HealthCheckResponse.SERVING
                    # Both keys always move together, so update them in one pass.
                    for service_key in (self.OVERALL_SERVER, self.SGLANG_SERVICE):
                        self._serving_status[service_key] = serving
                    logger.info("Health service status set to SERVING")
         
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
      
 68 
     | 
    
         
            +
                def set_not_serving(self):
                    """Flip every tracked health entry to NOT_SERVING.

                    Meant to be invoked while the server is shutting down. Both the
                    overall-server entry and the SGLang service entry in
                    ``self._serving_status`` are overwritten, then the change is logged.
                    """
                    not_serving = health_pb2.HealthCheckResponse.NOT_SERVING
                    # Both keys always move together, so update them in one pass.
                    for service_key in (self.OVERALL_SERVER, self.SGLANG_SERVICE):
                        self._serving_status[service_key] = not_serving
                    logger.info("Health service status set to NOT_SERVING")
         
     | 
| 
      
 77 
     | 
    
         
            +
             
     | 
| 
      
 78 
     | 
    
         
            +
                async def Check(
                    self,
                    request: health_pb2.HealthCheckRequest,
                    context: grpc.aio.ServicerContext,
                ) -> health_pb2.HealthCheckResponse:
                    """
                    Answer a unary health probe (the form used by Kubernetes gRPC probes).

                    Resolution order:
                      1. If the request manager reports ``gracefully_exit``, reply
                         NOT_SERVING regardless of the requested service.
                      2. For the overall-server name, reply with the stored status
                         (NOT_SERVING when no entry exists).
                      3. For any name other than the SGLang service, set NOT_FOUND on
                         the context and return SERVICE_UNKNOWN.
                      4. For the SGLang service, reply with the stored status unless it
                         is SERVING; when it is SERVING, still reply NOT_SERVING if more
                         than 30 seconds have passed since ``last_receive_tstamp``
                         while ``rid_to_state`` is non-empty.

                    Args:
                        request: Carries the name of the service being probed.
                        context: gRPC context; only touched for unknown services.

                    Returns:
                        HealthCheckResponse with SERVING, NOT_SERVING or SERVICE_UNKNOWN.
                    """
                    response_cls = health_pb2.HealthCheckResponse
                    manager = self.request_manager

                    service_name = request.service
                    logger.debug(f"Health check request for service: '{service_name}'")

                    # Shutdown overrides everything else.
                    if manager.gracefully_exit:
                        logger.debug("Health check: Server is shutting down")
                        return response_cls(status=response_cls.NOT_SERVING)

                    # Overall server health: report the stored status as-is.
                    if service_name == self.OVERALL_SERVER:
                        status = self._serving_status.get(
                            self.OVERALL_SERVER, response_cls.NOT_SERVING
                        )
                        logger.debug(
                            f"Overall health check: {response_cls.ServingStatus.Name(status)}"
                        )
                        return response_cls(status=status)

                    # Anything that is not the SGLang service is unknown to this servicer.
                    if service_name != self.SGLANG_SERVICE:
                        logger.debug(f"Health check for unknown service: '{service_name}'")
                        context.set_code(grpc.StatusCode.NOT_FOUND)
                        context.set_details(f"Unknown service: {service_name}")
                        return response_cls(status=response_cls.SERVICE_UNKNOWN)

                    # SGLang service: the stored status must be SERVING before any
                    # further readiness check is attempted.
                    base_status = self._serving_status.get(
                        self.SGLANG_SERVICE, response_cls.NOT_SERVING
                    )
                    if base_status != response_cls.SERVING:
                        logger.debug("Service health check: NOT_SERVING (base status)")
                        return response_cls(status=base_status)

                    # Scheduler responsiveness: nothing received recently while requests
                    # are still pending suggests the scheduler might be stuck.
                    # NOTE: the 30s threshold is hard-coded; per the original author's
                    # note it is deliberately more conservative than the 20s
                    # HEALTH_CHECK_TIMEOUT of the custom HealthCheck RPC and could be made
                    # configurable (e.g. via an environment variable) later.
                    idle_seconds = time.time() - manager.last_receive_tstamp
                    if idle_seconds > 30:
                        pending_count = len(manager.rid_to_state)
                        if pending_count > 0:
                            logger.warning(
                                f"Service health check: Scheduler not responsive "
                                f"({idle_seconds:.1f}s since last receive, "
                                f"{pending_count} pending requests)"
                            )
                            return response_cls(status=response_cls.NOT_SERVING)

                    logger.debug("Service health check: SERVING")
                    return response_cls(status=response_cls.SERVING)
         
     | 
| 
      
 162 
     | 
    
         
            +
             
     | 
| 
      
 163 
     | 
    
         
            +
                async def Watch(
                    self,
                    request: health_pb2.HealthCheckRequest,
                    context: grpc.aio.ServicerContext,
                ) -> AsyncIterator[health_pb2.HealthCheckResponse]:
                    """
                    Streaming health check, currently limited to a single snapshot.

                    Only the current status is sent once and the stream then ends; a full
                    implementation would monitor status changes and stream updates.
                    Kubernetes probes use Check rather than Watch.

                    Unknown service names are answered in-band with SERVICE_UNKNOWN and
                    without setting an error code on the context, as the gRPC health
                    checking protocol specifies for Watch. Delegating those to Check
                    would set NOT_FOUND on the context and end the stream with an error.
                    Note that an unknown name gets SERVICE_UNKNOWN even while the server
                    is shutting down.

                    Args:
                        request: Carries the name of the service being watched.
                        context: gRPC context, forwarded to Check for known services.

                    Yields:
                        A single HealthCheckResponse describing the current status.
                    """
                    service_name = request.service
                    logger.debug(f"Health watch request for service: '{service_name}'")

                    known_services = (self.OVERALL_SERVER, self.SGLANG_SERVICE)
                    if service_name not in known_services:
                        # Report in-band; do NOT touch the context status code here.
                        logger.debug(f"Health watch for unknown service: '{service_name}'")
                        yield health_pb2.HealthCheckResponse(
                            status=health_pb2.HealthCheckResponse.SERVICE_UNKNOWN
                        )
                        return

                    # Known service: reuse the unary logic for the current status.
                    yield await self.Check(request, context)

                    # Note: a full Watch implementation would keep the stream open and
                    # push updates on status changes. For K8s probes, Check is sufficient.
         
     | 
| 
         @@ -0,0 +1,181 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            """
         
     | 
| 
      
 2 
     | 
    
         
            +
            Scheduler process management for gRPC server.
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            This module handles launching and managing scheduler processes for the gRPC server,
         
     | 
| 
      
 5 
     | 
    
         
            +
            including tensor parallelism, pipeline parallelism, and data parallelism configurations.
         
     | 
| 
      
 6 
     | 
    
         
            +
            """
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            import logging
         
     | 
| 
      
 9 
     | 
    
         
            +
            import multiprocessing as mp
         
     | 
| 
      
 10 
     | 
    
         
            +
            import signal
         
     | 
| 
      
 11 
     | 
    
         
            +
            from typing import Dict, List, Optional, Tuple
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            from sglang.srt.managers.data_parallel_controller import (
         
     | 
| 
      
 14 
     | 
    
         
            +
                run_data_parallel_controller_process,
         
     | 
| 
      
 15 
     | 
    
         
            +
            )
         
     | 
| 
      
 16 
     | 
    
         
            +
            from sglang.srt.managers.scheduler import run_scheduler_process
         
     | 
| 
      
 17 
     | 
    
         
            +
            from sglang.srt.server_args import PortArgs, ServerArgs
         
     | 
| 
      
 18 
     | 
    
         
            +
            from sglang.srt.utils import configure_logger, prepare_model_and_tokenizer
         
     | 
| 
      
 19 
     | 
    
         
            +
            from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            logger = logging.getLogger(__name__)
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
            def run_scheduler_with_signal_handling(*args, **kwargs):
                """
                Run ``run_scheduler_process`` in a process that ignores SIGINT.

                SIGINT is set to ``SIG_IGN`` before the scheduler starts, so Ctrl+C is
                left for the parent to handle. According to the original author's
                note, the scheduler is expected to terminate when the parent gRPC
                server exits (via kill_itself_when_parent_died); that mechanism is not
                visible in this function.

                Args:
                    *args: Forwarded positionally to run_scheduler_process.
                    **kwargs: Forwarded by keyword to run_scheduler_process.
                """
                # Must happen first so the scheduler never sees a KeyboardInterrupt.
                signal.signal(signal.SIGINT, signal.SIG_IGN)

                # Hand over to the real scheduler entry point with unchanged arguments.
                run_scheduler_process(*args, **kwargs)
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
            def launch_scheduler_process_only(
         
     | 
| 
      
 43 
     | 
    
         
            +
                server_args: ServerArgs,
         
     | 
| 
      
 44 
     | 
    
         
            +
                port_args: Optional[PortArgs] = None,
         
     | 
| 
      
 45 
     | 
    
         
            +
            ) -> Tuple[Dict, PortArgs, List[mp.Process]]:
         
     | 
| 
      
 46 
     | 
    
         
            +
                """
         
     | 
| 
      
 47 
     | 
    
         
            +
                Launch only the scheduler process(es) without tokenizer/detokenizer.
         
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
                This function handles all scheduler startup logic including:
         
     | 
| 
      
 50 
     | 
    
         
            +
                - Tensor parallelism (tp_size)
         
     | 
| 
      
 51 
     | 
    
         
            +
                - Pipeline parallelism (pp_size)
         
     | 
| 
      
 52 
     | 
    
         
            +
                - Data parallelism (dp_size)
         
     | 
| 
      
 53 
     | 
    
         
            +
                - Multi-node distributed setup
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
                Args:
         
     | 
| 
      
 56 
     | 
    
         
            +
                    server_args: Server configuration
         
     | 
| 
      
 57 
     | 
    
         
            +
                    port_args: Port configuration (created if None)
         
     | 
| 
      
 58 
     | 
    
         
            +
             
     | 
| 
      
 59 
     | 
    
         
            +
                Returns:
         
     | 
| 
      
 60 
     | 
    
         
            +
                    Tuple of (scheduler_info, port_args, scheduler_processes):
         
     | 
| 
      
 61 
     | 
    
         
            +
                    - scheduler_info: Dict with model metadata and configuration
         
     | 
| 
      
 62 
     | 
    
         
            +
                    - port_args: Port configuration used for IPC
         
     | 
| 
      
 63 
     | 
    
         
            +
                    - scheduler_processes: List of launched scheduler Process objects
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
                Raises:
         
     | 
| 
      
 66 
     | 
    
         
            +
                    RuntimeError: If any scheduler process fails to initialize
         
     | 
| 
      
 67 
     | 
    
         
            +
                """
         
     | 
| 
      
 68 
     | 
    
         
            +
                # Configure global environment
         
     | 
| 
      
 69 
     | 
    
         
            +
                configure_logger(server_args)
         
     | 
| 
      
 70 
     | 
    
         
            +
                server_args.check_server_args()
         
     | 
| 
      
 71 
     | 
    
         
            +
             
     | 
| 
      
 72 
     | 
    
         
            +
                # Fix CUDA multiprocessing issues - must be called before any CUDA operations
         
     | 
| 
      
 73 
     | 
    
         
            +
                mp.set_start_method("spawn", force=True)
         
     | 
| 
      
 74 
     | 
    
         
            +
             
     | 
| 
      
 75 
     | 
    
         
            +
                # Allocate ports for inter-process communications
         
     | 
| 
      
 76 
     | 
    
         
            +
                if port_args is None:
         
     | 
| 
      
 77 
     | 
    
         
            +
                    port_args = PortArgs.init_new(server_args)
         
     | 
| 
      
 78 
     | 
    
         
            +
                    logger.info(f"{server_args=}")
         
     | 
| 
      
 79 
     | 
    
         
            +
             
     | 
| 
      
 80 
     | 
    
         
            +
                # Prepare model and tokenizer paths
         
     | 
| 
      
 81 
     | 
    
         
            +
                server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
         
     | 
| 
      
 82 
     | 
    
         
            +
                    server_args.model_path, server_args.tokenizer_path
         
     | 
| 
      
 83 
     | 
    
         
            +
                )
         
     | 
| 
      
 84 
     | 
    
         
            +
             
     | 
| 
      
 85 
     | 
    
         
            +
                scheduler_procs = []
         
     | 
| 
      
 86 
     | 
    
         
            +
             
     | 
| 
      
 87 
     | 
    
         
            +
                if server_args.dp_size == 1:
         
     | 
| 
      
 88 
     | 
    
         
            +
                    # Single data parallel group - launch TP/PP schedulers
         
     | 
| 
      
 89 
     | 
    
         
            +
                    memory_saver_adapter = TorchMemorySaverAdapter.create(
         
     | 
| 
      
 90 
     | 
    
         
            +
                        enable=server_args.enable_memory_saver
         
     | 
| 
      
 91 
     | 
    
         
            +
                    )
         
     | 
| 
      
 92 
     | 
    
         
            +
                    scheduler_pipe_readers = []
         
     | 
| 
      
 93 
     | 
    
         
            +
             
     | 
| 
      
 94 
     | 
    
         
            +
                    # Calculate TP/PP distribution across nodes
         
     | 
| 
      
 95 
     | 
    
         
            +
                    nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
         
     | 
| 
      
 96 
     | 
    
         
            +
                    tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
         
     | 
| 
      
 97 
     | 
    
         
            +
                    tp_rank_range = range(
         
     | 
| 
      
 98 
     | 
    
         
            +
                        tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
         
     | 
| 
      
 99 
     | 
    
         
            +
                        tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
         
     | 
| 
      
 100 
     | 
    
         
            +
                    )
         
     | 
| 
      
 101 
     | 
    
         
            +
             
     | 
| 
      
 102 
     | 
    
         
            +
                    pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
         
     | 
| 
      
 103 
     | 
    
         
            +
                    pp_rank_range = range(
         
     | 
| 
      
 104 
     | 
    
         
            +
                        pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
         
     | 
| 
      
 105 
     | 
    
         
            +
                        pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
         
     | 
| 
      
 106 
     | 
    
         
            +
                    )
         
     | 
| 
      
 107 
     | 
    
         
            +
             
     | 
| 
      
 108 
     | 
    
         
            +
                    # Launch scheduler for each TP/PP rank combination
         
     | 
| 
      
 109 
     | 
    
         
            +
                    for pp_rank in pp_rank_range:
         
     | 
| 
      
 110 
     | 
    
         
            +
                        for tp_rank in tp_rank_range:
         
     | 
| 
      
 111 
     | 
    
         
            +
                            reader, writer = mp.Pipe(duplex=False)
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
                            # Calculate GPU ID for this rank
         
     | 
| 
      
 114 
     | 
    
         
            +
                            gpu_id = (
         
     | 
| 
      
 115 
     | 
    
         
            +
                                server_args.base_gpu_id
         
     | 
| 
      
 116 
     | 
    
         
            +
                                + ((pp_rank % pp_size_per_node) * tp_size_per_node)
         
     | 
| 
      
 117 
     | 
    
         
            +
                                + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
         
     | 
| 
      
 118 
     | 
    
         
            +
                            )
         
     | 
| 
      
 119 
     | 
    
         
            +
             
     | 
| 
      
 120 
     | 
    
         
            +
                            # Calculate MoE expert parallel rank
         
     | 
| 
      
 121 
     | 
    
         
            +
                            moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
         
     | 
| 
      
 122 
     | 
    
         
            +
             
     | 
| 
      
 123 
     | 
    
         
            +
                            # Create scheduler process
         
     | 
| 
      
 124 
     | 
    
         
            +
                            proc = mp.Process(
         
     | 
| 
      
 125 
     | 
    
         
            +
                                target=run_scheduler_with_signal_handling,
         
     | 
| 
      
 126 
     | 
    
         
            +
                                args=(
         
     | 
| 
      
 127 
     | 
    
         
            +
                                    server_args,
         
     | 
| 
      
 128 
     | 
    
         
            +
                                    port_args,
         
     | 
| 
      
 129 
     | 
    
         
            +
                                    gpu_id,
         
     | 
| 
      
 130 
     | 
    
         
            +
                                    tp_rank,
         
     | 
| 
      
 131 
     | 
    
         
            +
                                    moe_ep_rank,
         
     | 
| 
      
 132 
     | 
    
         
            +
                                    pp_rank,
         
     | 
| 
      
 133 
     | 
    
         
            +
                                    None,  # dp_rank
         
     | 
| 
      
 134 
     | 
    
         
            +
                                    writer,
         
     | 
| 
      
 135 
     | 
    
         
            +
                                ),
         
     | 
| 
      
 136 
     | 
    
         
            +
                            )
         
     | 
| 
      
 137 
     | 
    
         
            +
             
     | 
| 
      
 138 
     | 
    
         
            +
                            with memory_saver_adapter.configure_subprocess():
         
     | 
| 
      
 139 
     | 
    
         
            +
                                proc.start()
         
     | 
| 
      
 140 
     | 
    
         
            +
             
     | 
| 
      
 141 
     | 
    
         
            +
                            scheduler_procs.append(proc)
         
     | 
| 
      
 142 
     | 
    
         
            +
                            scheduler_pipe_readers.append(reader)
         
     | 
| 
      
 143 
     | 
    
         
            +
                else:
         
     | 
| 
      
 144 
     | 
    
         
            +
                    # Data parallelism - launch data parallel controller
         
     | 
| 
      
 145 
     | 
    
         
            +
                    reader, writer = mp.Pipe(duplex=False)
         
     | 
| 
      
 146 
     | 
    
         
            +
                    scheduler_pipe_readers = [reader]
         
     | 
| 
      
 147 
     | 
    
         
            +
             
     | 
| 
      
 148 
     | 
    
         
            +
                    proc = mp.Process(
         
     | 
| 
      
 149 
     | 
    
         
            +
                        target=run_data_parallel_controller_process,
         
     | 
| 
      
 150 
     | 
    
         
            +
                        args=(server_args, port_args, writer),
         
     | 
| 
      
 151 
     | 
    
         
            +
                    )
         
     | 
| 
      
 152 
     | 
    
         
            +
                    proc.start()
         
     | 
| 
      
 153 
     | 
    
         
            +
                    scheduler_procs.append(proc)
         
     | 
| 
      
 154 
     | 
    
         
            +
             
     | 
| 
      
 155 
     | 
    
         
            +
                # TODO(CatherineSue): handle cases for multi-node
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
      
 157 
     | 
    
         
            +
                # Wait for all scheduler processes to be ready
         
     | 
| 
      
 158 
     | 
    
         
            +
                scheduler_infos = []
         
     | 
| 
      
 159 
     | 
    
         
            +
                for i, reader in enumerate(scheduler_pipe_readers):
         
     | 
| 
      
 160 
     | 
    
         
            +
                    try:
         
     | 
| 
      
 161 
     | 
    
         
            +
                        data = reader.recv()
         
     | 
| 
      
 162 
     | 
    
         
            +
                    except EOFError:
         
     | 
| 
      
 163 
     | 
    
         
            +
                        logger.error(
         
     | 
| 
      
 164 
     | 
    
         
            +
                            f"Rank {i} scheduler is dead. Please check if there are relevant logs."
         
     | 
| 
      
 165 
     | 
    
         
            +
                        )
         
     | 
| 
      
 166 
     | 
    
         
            +
                        scheduler_procs[i].join()
         
     | 
| 
      
 167 
     | 
    
         
            +
                        logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
         
     | 
| 
      
 168 
     | 
    
         
            +
                        raise RuntimeError(f"Failed to initialize scheduler rank {i}")
         
     | 
| 
      
 169 
     | 
    
         
            +
             
     | 
| 
      
 170 
     | 
    
         
            +
                    if data.get("status") != "ready":
         
     | 
| 
      
 171 
     | 
    
         
            +
                        raise RuntimeError(
         
     | 
| 
      
 172 
     | 
    
         
            +
                            f"Scheduler rank {i} initialization failed: {data.get('error', 'Unknown error')}"
         
     | 
| 
      
 173 
     | 
    
         
            +
                        )
         
     | 
| 
      
 174 
     | 
    
         
            +
                    scheduler_infos.append(data)
         
     | 
| 
      
 175 
     | 
    
         
            +
             
     | 
| 
      
 176 
     | 
    
         
            +
                logger.info(
         
     | 
| 
      
 177 
     | 
    
         
            +
                    f"All {len(scheduler_procs)} scheduler process(es) initialized successfully"
         
     | 
| 
      
 178 
     | 
    
         
            +
                )
         
     | 
| 
      
 179 
     | 
    
         
            +
             
     | 
| 
      
 180 
     | 
    
         
            +
                # Return the first scheduler's info (they should all be the same)
         
     | 
| 
      
 181 
     | 
    
         
            +
                return scheduler_infos[0], port_args, scheduler_procs
         
     |