sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/grpc_server.py (new file, +810 lines)
@@ -0,0 +1,810 @@
"""
Standalone gRPC Server for SGLang - Fully separated from HTTP server.
Uses GrpcRequestManager for orchestration without tokenization.
"""

import argparse
import asyncio
import logging
import multiprocessing as mp
import os
import signal
import time
from concurrent import futures
from typing import AsyncIterator, Dict, Optional, Tuple

import grpc
from grpc_reflection.v1alpha import reflection

from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
from sglang.srt.managers.data_parallel_controller import (
    run_data_parallel_controller_process,
)
from sglang.srt.managers.disagg_service import start_disagg_service
from sglang.srt.managers.io_struct import (
    TokenizedEmbeddingReqInput,
    TokenizedGenerateReqInput,
)
from sglang.srt.managers.scheduler import run_scheduler_process
from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.utils import configure_logger, prepare_model_and_tokenizer
from sglang.utils import get_exception_traceback

logger = logging.getLogger(__name__)
HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))

def _run_scheduler_with_signal_handling(*args, **kwargs):
    """
    Wrapper for run_scheduler_process that ignores SIGINT.

    The scheduler process should not handle Ctrl+C - it should only terminate
    when the parent gRPC server exits (via kill_itself_when_parent_died).
    """
    # Ignore SIGINT in this subprocess - let the parent handle it
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Now run the actual scheduler process
    run_scheduler_process(*args, **kwargs)


def _launch_scheduler_process_only(
    server_args: ServerArgs,
    port_args: Optional[PortArgs] = None,
) -> Tuple[Dict, PortArgs, list]:
    """
    Launch only the scheduler process(es) without tokenizer/detokenizer.
    Returns scheduler info, port args, and list of scheduler processes.
    """
    # Configure global environment
    configure_logger(server_args)
    server_args.check_server_args()

    # Allocate ports for inter-process communications
    if port_args is None:
        port_args = PortArgs.init_new(server_args)
    logger.info(f"{server_args=}")

    # Prepare model and tokenizer paths
    server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
        server_args.model_path, server_args.tokenizer_path
    )

    scheduler_procs = []
    if server_args.dp_size == 1:
        memory_saver_adapter = TorchMemorySaverAdapter.create(
            enable=server_args.enable_memory_saver
        )
        scheduler_pipe_readers = []

        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
        tp_rank_range = range(
            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
        )

        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
        pp_rank_range = range(
            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
        )

        for pp_rank in pp_rank_range:
            for tp_rank in tp_rank_range:
                reader, writer = mp.Pipe(duplex=False)
                gpu_id = (
                    server_args.base_gpu_id
                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
                )
                moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
                proc = mp.Process(
                    target=_run_scheduler_with_signal_handling,
                    args=(
                        server_args,
                        port_args,
                        gpu_id,
                        tp_rank,
                        moe_ep_rank,
                        pp_rank,
                        None,
                        writer,
                    ),
                )

                with memory_saver_adapter.configure_subprocess():
                    proc.start()
                scheduler_procs.append(proc)
                scheduler_pipe_readers.append(reader)
    else:
        # Launch the data parallel controller
        reader, writer = mp.Pipe(duplex=False)
        scheduler_pipe_readers = [reader]
        proc = mp.Process(
            target=run_data_parallel_controller_process,
            args=(server_args, port_args, writer),
        )
        proc.start()
        scheduler_procs.append(proc)

    # TODO(CatherineSue): handle cases for multi-node

    # Wait for all scheduler processes to be ready
    scheduler_infos = []
    for i, reader in enumerate(scheduler_pipe_readers):
        try:
            data = reader.recv()
        except EOFError:
            logger.error(
                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
            )
            scheduler_procs[i].join()
            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
            raise RuntimeError(f"Failed to initialize scheduler rank {i}")

        if data.get("status") != "ready":
            raise RuntimeError(
                f"Scheduler rank {i} initialization failed: {data.get('error', 'Unknown error')}"
            )
        scheduler_infos.append(data)

    logger.info(
        f"All {len(scheduler_procs)} scheduler process(es) initialized successfully"
    )

    # Return the first scheduler's info (they should all be the same)
    return scheduler_infos[0], port_args, scheduler_procs

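# A worked example of the rank-range arithmetic above (hypothetical values,
# not part of the released file). With nnodes=2, pp_size=1, tp_size=8,
# node_rank=1, base_gpu_id=0, gpu_id_step=1:
#
#   nnodes_per_tp_group = max(2 // 1, 1)                          -> 2
#   tp_size_per_node    = 8 // 2                                  -> 4
#   tp_rank_range       = range(4 * (1 % 2), 4 * (1 % 2 + 1))     -> range(4, 8)
#   pp_size_per_node    = max(1 // 2, 1)                          -> 1
#   pp_rank_range       = range(1 * (1 // 2), 1 * (1 // 2 + 1))   -> range(0, 1)
#
# so node 1 launches schedulers for tp ranks 4..7, which land on local
# gpu_ids 0..3 via (tp_rank % tp_size_per_node) * gpu_id_step.
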
class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
    """
    Standalone gRPC service implementation using GrpcRequestManager.
    Fully separated from HTTP server with its own process and no shared globals.
    """

    def __init__(
        self,
        request_manager: GrpcRequestManager,
        server_args: ServerArgs,
        model_info: Dict,
    ):
        """Initialize the standalone gRPC service."""
        self.request_manager = request_manager
        self.server_args = server_args
        self.model_info = model_info
        self.start_time = time.time()

        # Start the request manager's event loop using auto_create_handle_loop
        self.request_manager.auto_create_handle_loop()

        logger.info("Standalone gRPC scheduler service initialized")

    async def Generate(
        self,
        request: sglang_scheduler_pb2.GenerateRequest,
        context: grpc.aio.ServicerContext,
    ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
        """Handle generation requests with streaming responses."""
        logger.info(f"Generation request: {request.request_id}")

        try:
            # Convert gRPC request to internal format
            tokenized_req = self._convert_generate_request(request)

            # Submit to request manager (automatically handles n>1)
            response_generator = self.request_manager.generate_request(
                obj=tokenized_req,
                request_id=request.request_id,
                grpc_context=context,
            )

            async for output in response_generator:
                # Handle batch responses (for n>1 non-streaming)
                if isinstance(output, list):
                    for batch_output in output:
                        if "error" in batch_output:
                            yield sglang_scheduler_pb2.GenerateResponse(
                                request_id=request.request_id,
                                error=sglang_scheduler_pb2.GenerateError(
                                    message=batch_output["error"],
                                    http_status_code=(
                                        "500" if "abort" not in batch_output else "499"
                                    ),
                                ),
                            )
                        else:
                            # All non-error batch outputs are final responses
                            yield self._create_completion_response(
                                request.request_id, batch_output
                            )
                else:
                    # Handle single response (for streaming or n=1 non-streaming)
                    if "error" in output:
                        yield sglang_scheduler_pb2.GenerateResponse(
                            request_id=request.request_id,
                            error=sglang_scheduler_pb2.GenerateError(
                                message=output["error"],
                                http_status_code=(
                                    "500" if "abort" not in output else "499"
                                ),
                            ),
                        )
                    elif output.get("finished", False):
                        yield self._create_completion_response(
                            request.request_id, output
                        )
                    else:
                        yield self._create_chunk_response(request.request_id, output)

        except Exception as e:
            logger.error(f"Generate failed: {e}\n{get_exception_traceback()}")
            yield sglang_scheduler_pb2.GenerateResponse(
                request_id=request.request_id,
                error=sglang_scheduler_pb2.GenerateError(
                    message=str(e),
                    http_status_code="500",
                    details=get_exception_traceback(),
                ),
            )

    async def Embed(
        self,
        request: sglang_scheduler_pb2.EmbedRequest,
        context: grpc.aio.ServicerContext,
    ) -> sglang_scheduler_pb2.EmbedResponse:
        """Handle embedding requests."""
        logger.info(f"Embedding request: {request.request_id}")

        try:
            # Convert request
            tokenized_req = self._convert_embed_request(request)

            # Submit to request manager
            future = await self.request_manager.embedding_request(
                obj=tokenized_req,
                request_id=request.request_id,
            )

            # Wait for result
            result = await future

            # Create response
            return sglang_scheduler_pb2.EmbedResponse(
                request_id=request.request_id,
                complete=sglang_scheduler_pb2.EmbedComplete(
                    embedding=result["embedding"],
                    prompt_tokens=result.get("prompt_tokens", 0),
                    cached_tokens=0,
                    embedding_dim=len(result["embedding"]),
                ),
            )

        except Exception as e:
            logger.error(f"Embed failed: {e}\n{get_exception_traceback()}")
            return sglang_scheduler_pb2.EmbedResponse(
                request_id=request.request_id,
                error=sglang_scheduler_pb2.EmbedError(
                    message=str(e),
                    code="INTERNAL_ERROR",
                    details=get_exception_traceback(),
                ),
            )

    async def HealthCheck(
        self,
        request: sglang_scheduler_pb2.HealthCheckRequest,
        context: grpc.aio.ServicerContext,
    ) -> sglang_scheduler_pb2.HealthCheckResponse:
        """Health check by generating from client input."""
        try:
            # Check if request manager is shutting down
            if self.request_manager.gracefully_exit:
                return sglang_scheduler_pb2.HealthCheckResponse(
                    healthy=False, message="Server shutting down"
                )

            # Extract tokenized input from request
            if not request.HasField("tokenized"):
                return sglang_scheduler_pb2.HealthCheckResponse(
                    healthy=False, message="Tokenized input required for health check"
                )

            input_text = request.tokenized.original_text
            input_ids = list(request.tokenized.input_ids)

            # Create health check request
            rid = f"HEALTH_CHECK_GRPC_{time.time()}"

            health_request = TokenizedGenerateReqInput(
                rid=rid,
                input_text=input_text,
                input_ids=input_ids,
                sampling_params=SGLSamplingParams(max_new_tokens=1, temperature=0.0),
                stream=False,
                mm_inputs=None,
                return_logprob=False,
                logprob_start_len=-1,
                top_logprobs_num=0,
                token_ids_logprob=None,
            )

            if self.server_args.disaggregation_mode != DisaggregationMode.NULL:
                health_request.bootstrap_host = FAKE_BOOTSTRAP_HOST
                health_request.bootstrap_room = 0

            logger.info(f"Sending health check request to request manager...")

            # Submit and wait for response
            output_generator = self.request_manager.generate_request(
                health_request, request_id=rid
            )

            try:
                # Get first response with timeout
                response = await asyncio.wait_for(
                    output_generator.__anext__(), timeout=HEALTH_CHECK_TIMEOUT
                )

                # Clean up
                if rid in self.request_manager.rid_to_state:
                    del self.request_manager.rid_to_state[rid]

                return sglang_scheduler_pb2.HealthCheckResponse(
                    healthy=True, message="Health check passed"
                )

            except asyncio.TimeoutError:
                # Clean up on timeout
                if rid in self.request_manager.rid_to_state:
                    del self.request_manager.rid_to_state[rid]

                return sglang_scheduler_pb2.HealthCheckResponse(
                    healthy=False, message="Health check timeout"
                )

        except Exception as e:
            logger.error(f"Health check failed: {e}")
            return sglang_scheduler_pb2.HealthCheckResponse(
                healthy=False, message=f"Health check error: {str(e)}"
            )

    async def Abort(
        self,
        request: sglang_scheduler_pb2.AbortRequest,
        context: grpc.aio.ServicerContext,
    ) -> sglang_scheduler_pb2.AbortResponse:
        """Abort an ongoing request."""
        logger.info(f"Aborting request: {request.request_id}")

        try:
            success = await self.request_manager.abort_request(request.request_id)

            return sglang_scheduler_pb2.AbortResponse(
                success=success,
                message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
            )
        except Exception as e:
            logger.error(f"Abort failed: {e}")
            return sglang_scheduler_pb2.AbortResponse(
                success=False,
                message=str(e),
            )

    # Helper methods for request/response conversion

    def _convert_generate_request(
        self, grpc_req: sglang_scheduler_pb2.GenerateRequest
    ) -> TokenizedGenerateReqInput:
        """Convert gRPC GenerateRequest to internal format."""

        # Extract tokenized input
        if not grpc_req.HasField("tokenized"):
            raise ValueError("Tokenized input must be provided")

        input_text = grpc_req.tokenized.original_text
        input_ids = list(grpc_req.tokenized.input_ids)

        # Convert sampling params
        sampling_params = self._convert_sampling_params(grpc_req.sampling_params)

        # Extract disaggregated params if present
        bootstrap_host = None
        bootstrap_port = None
        bootstrap_room = None
        if grpc_req.HasField("disaggregated_params"):
            bootstrap_host = grpc_req.disaggregated_params.bootstrap_host or None
            bootstrap_port = grpc_req.disaggregated_params.bootstrap_port or None
            bootstrap_room = grpc_req.disaggregated_params.bootstrap_room or None

        # Create request
        return TokenizedGenerateReqInput(
            rid=grpc_req.request_id,
            input_text=input_text,
            input_ids=input_ids,
            mm_inputs=None,  # TODO: implement mm support
            sampling_params=sampling_params,
            return_logprob=grpc_req.return_logprob,
            logprob_start_len=(
                grpc_req.logprob_start_len
                if grpc_req.logprob_start_len is not None
                else -1
            ),
            top_logprobs_num=grpc_req.top_logprobs_num or 0,
            stream=grpc_req.stream or False,
            lora_id=grpc_req.lora_id if grpc_req.lora_id else None,
            token_ids_logprob=(
                list(grpc_req.token_ids_logprob) if grpc_req.token_ids_logprob else None
            ),
            bootstrap_host=bootstrap_host,
            bootstrap_port=bootstrap_port,
            bootstrap_room=bootstrap_room,
        )

    def _convert_embed_request(
        self, grpc_req: sglang_scheduler_pb2.EmbedRequest
    ) -> TokenizedEmbeddingReqInput:
        """Convert gRPC EmbedRequest to internal format."""

        # Extract tokenized input
        if not grpc_req.HasField("tokenized"):
            raise ValueError("Tokenized input must be provided")

        input_text = grpc_req.tokenized.original_text
        input_ids = list(grpc_req.tokenized.input_ids)

        return TokenizedEmbeddingReqInput(
            rid=grpc_req.request_id,
            input_text=input_text,
            input_ids=input_ids,
        )

    def _convert_sampling_params(
        self, grpc_params: sglang_scheduler_pb2.SamplingParams
    ) -> SGLSamplingParams:
        """Convert gRPC SamplingParams to internal format."""

        # Handle constraint types
        regex = None
        json_schema = None
        ebnf_grammar = None
        structural_tag = None

        if grpc_params.HasField("regex"):
            regex = grpc_params.regex
        elif grpc_params.HasField("json_schema"):
            json_schema = grpc_params.json_schema
        elif grpc_params.HasField("ebnf_grammar"):
            ebnf_grammar = grpc_params.ebnf_grammar
        elif grpc_params.HasField("structural_tag"):
            structural_tag = grpc_params.structural_tag

        return SGLSamplingParams(
            temperature=grpc_params.temperature or 1.0,
            top_p=grpc_params.top_p or 1.0,
            top_k=grpc_params.top_k or -1,
            min_p=grpc_params.min_p or 0.0,
            frequency_penalty=grpc_params.frequency_penalty or 0.0,
            presence_penalty=grpc_params.presence_penalty or 0.0,
            repetition_penalty=grpc_params.repetition_penalty or 1.0,
            max_new_tokens=grpc_params.max_new_tokens or 128,
            min_new_tokens=grpc_params.min_new_tokens or 0,
            stop=list(grpc_params.stop) if grpc_params.stop else [],
            stop_token_ids=(
                list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else []
            ),
            skip_special_tokens=grpc_params.skip_special_tokens,
            spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
            regex=regex,
            json_schema=json_schema,
            ebnf=ebnf_grammar,
            structural_tag=structural_tag,
            n=grpc_params.n or 1,
            ignore_eos=grpc_params.ignore_eos,
        )

    def _convert_output_logprobs_to_proto(
        self, logprobs_data: Dict
    ) -> Optional[sglang_scheduler_pb2.OutputLogProbs]:
        """Convert output logprobs dict to proto (no None values, plain floats)."""
        if not logprobs_data:
            return None

        token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
        token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
        top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
        top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])

        # Build TopLogProbs entries
        top_logprobs_proto = []
        if top_logprobs_val and top_logprobs_idx:
            for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
                top_logprobs_proto.append(
                    sglang_scheduler_pb2.TopLogProbs(
                        values=val_list,
                        token_ids=idx_list,
                    )
                )

        return sglang_scheduler_pb2.OutputLogProbs(
            token_logprobs=token_logprobs_val,  # Plain float array
            token_ids=token_logprobs_idx,
            top_logprobs=top_logprobs_proto,
        )

    def _convert_input_logprobs_to_proto(
        self, logprobs_data: Dict
    ) -> Optional[sglang_scheduler_pb2.InputLogProbs]:
        """Convert input logprobs dict to proto (first token is None, wrapped in InputTokenLogProb)."""
        if not logprobs_data:
            return None

        token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
        token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
        top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
        top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])

        # Wrap values in InputTokenLogProb (None for first token, value for others)
        token_logprobs_wrapped = [
            (
                sglang_scheduler_pb2.InputTokenLogProb()
                if x is None
                else sglang_scheduler_pb2.InputTokenLogProb(value=x)
            )
            for x in token_logprobs_val
        ]

        # Build TopLogProbs entries
        top_logprobs_proto = []
        if top_logprobs_val and top_logprobs_idx:
            for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
                top_logprobs_proto.append(
                    sglang_scheduler_pb2.TopLogProbs(
                        values=val_list,
                        token_ids=idx_list,
                    )
                )

        return sglang_scheduler_pb2.InputLogProbs(
            token_logprobs=token_logprobs_wrapped,
            token_ids=token_logprobs_idx,
            top_logprobs=top_logprobs_proto,
        )

    def _create_chunk_response(
        self, request_id: str, output: Dict
    ) -> sglang_scheduler_pb2.GenerateResponse:
        """Create a streaming chunk response."""
        meta_info = output.get("meta_info", {})

        # Convert output logprobs if present
        output_logprobs_proto = self._convert_output_logprobs_to_proto(
            output.get("output_logprobs")
        )

        # Convert input logprobs if present (only in first chunk)
        input_logprobs_proto = self._convert_input_logprobs_to_proto(
            output.get("input_logprobs")
        )

        return sglang_scheduler_pb2.GenerateResponse(
            request_id=request_id,
            chunk=sglang_scheduler_pb2.GenerateStreamChunk(
                token_ids=output.get("token_ids", []),
                prompt_tokens=meta_info.get("prompt_tokens", 0),
                completion_tokens=meta_info.get("completion_tokens", 0),
                cached_tokens=meta_info.get("cached_tokens", 0),
                output_logprobs=output_logprobs_proto,
                input_logprobs=input_logprobs_proto,
                index=output.get("index", 0),
            ),
        )

    def _create_completion_response(
        self, request_id: str, output: Dict
    ) -> sglang_scheduler_pb2.GenerateResponse:
        """Create a completion response."""

        # Extract meta info and finish reason details
        meta_info = output.get("meta_info", {})
        finish_reason_data = meta_info.get("finish_reason")

        # Determine finish reason, default is stop
        finish_reason = "stop"
        if finish_reason_data:
            if isinstance(finish_reason_data, dict):
                finish_reason_type = finish_reason_data.get("type")
            else:
                # Handle legacy string format
                finish_reason_type = finish_reason_data

            if finish_reason_type == "length":
                finish_reason = "length"
            elif finish_reason_type == "abort":
                finish_reason = "abort"

        # Extract matched_stop information
        matched_stop_kwargs = {}
        if isinstance(finish_reason_data, dict) and "matched" in finish_reason_data:
            matched = finish_reason_data["matched"]
            if isinstance(matched, int):
                matched_stop_kwargs["matched_token_id"] = matched
            elif isinstance(matched, str):
                matched_stop_kwargs["matched_stop_str"] = matched

        # Convert output logprobs if present
        output_logprobs_proto = self._convert_output_logprobs_to_proto(
            output.get("output_logprobs")
        )

        # Convert input logprobs if present
        input_logprobs_proto = self._convert_input_logprobs_to_proto(
            output.get("input_logprobs")
        )

        return sglang_scheduler_pb2.GenerateResponse(
            request_id=request_id,
            complete=sglang_scheduler_pb2.GenerateComplete(
                output_ids=output.get("token_ids", []),
                finish_reason=finish_reason,
                prompt_tokens=meta_info.get("prompt_tokens", 0),
                completion_tokens=meta_info.get(
                    "completion_tokens", len(output.get("token_ids", []))
                ),
                cached_tokens=meta_info.get("cached_tokens", 0),
                output_logprobs=output_logprobs_proto,
                input_logprobs=input_logprobs_proto,
                index=output.get("index", 0),
                **matched_stop_kwargs,
            ),
        )

    async def shutdown(self):
        """Shutdown the service."""
        logger.info("Shutting down gRPC service")

        # Shutdown request manager (handles its own tasks)
        await self.request_manager.shutdown()

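# An illustration of the logprob wire format above (hypothetical values, not
# part of the released file). Input logprobs carry None for the first prompt
# token, e.g. [None, -1.2, -0.5], so _convert_input_logprobs_to_proto wraps
# each entry:
#
#   InputTokenLogProb()             # first token, no value set
#   InputTokenLogProb(value=-1.2)
#   InputTokenLogProb(value=-0.5)
#
# Output logprobs are never None, so _convert_output_logprobs_to_proto sends
# them as a plain float array without the wrapper message.
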
async def serve_grpc(
    server_args: ServerArgs,
    model_info: Optional[Dict] = None,
):
    """Start the standalone gRPC server with integrated scheduler."""

    # Start bootstrap server BEFORE launching scheduler processes (only in PREFILL mode)
    # This ensures the bootstrap server is ready when prefill schedulers try to register
    bootstrap_server = None
    if server_args.disaggregation_mode == "prefill":
        bootstrap_server = start_disagg_service(server_args)
        if bootstrap_server:
            logger.info(
                f"Bootstrap server started for disaggregation mode on {server_args.host}:{server_args.disaggregation_bootstrap_port}"
            )

    # Launch only the scheduler process(es) (no tokenizer/detokenizer needed for gRPC)
    logger.info("Launching scheduler process(es)...")
    scheduler_info, port_args, scheduler_procs = _launch_scheduler_process_only(
        server_args=server_args,
    )

    # Update model info from scheduler info
    if model_info is None:
        model_info = {
            "model_name": server_args.model_path,
            "max_context_length": scheduler_info.get(
                "max_total_num_tokens", server_args.context_length or 8192
            ),
            "vocab_size": scheduler_info.get("vocab_size", 128256),
            "supports_vision": scheduler_info.get("supports_vision", False),
            "model_type": scheduler_info.get("model_type", "transformer"),
            "max_req_input_len": scheduler_info.get("max_req_input_len", 8192),
            "eos_token_ids": scheduler_info.get("eos_token_ids", []),
            "pad_token_id": scheduler_info.get("pad_token_id", 0),
            "bos_token_id": scheduler_info.get("bos_token_id", 1),
        }

    # Create request manager with the correct port args
    # Note: We pass None for bootstrap_server since it's already started above
    request_manager = GrpcRequestManager(
        server_args=server_args,
        port_args=port_args,
        bootstrap_server=bootstrap_server,
    )

    # Create gRPC server
    server = grpc.aio.server(
        futures.ThreadPoolExecutor(max_workers=10),
        options=[
            ("grpc.max_send_message_length", 1024 * 1024 * 256),
            ("grpc.max_receive_message_length", 1024 * 1024 * 256),
        ],
    )

    # Add service
    servicer = SGLangSchedulerServicer(
        request_manager=request_manager,
        server_args=server_args,
        model_info=model_info,
    )
    sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)

    # Enable reflection
    SERVICE_NAMES = (
        sglang_scheduler_pb2.DESCRIPTOR.services_by_name["SglangScheduler"].full_name,
        reflection.SERVICE_NAME,
    )
    reflection.enable_server_reflection(SERVICE_NAMES, server)

    # Start server
    listen_addr = f"{server_args.host}:{server_args.port}"
    server.add_insecure_port(listen_addr)

    logger.info(f"Starting standalone gRPC server on {listen_addr}")

    await server.start()

    # Handle shutdown signals
    loop = asyncio.get_running_loop()
    stop_event = asyncio.Event()

    def signal_handler():
        logger.info("Received shutdown signal")
        stop_event.set()

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    try:
        await stop_event.wait()
    finally:
        logger.info("Shutting down gRPC server")

        # Shutdown request manager first - this closes ZMQ sockets and stops background tasks
        await servicer.shutdown()

        # Stop the gRPC server
        await server.stop(5.0)

        # Terminate scheduler processes before exiting to avoid atexit hang
        # The scheduler processes have SIGINT ignored, so they won't get KeyboardInterrupt
        for i, proc in enumerate(scheduler_procs):
            if proc.is_alive():
                logger.info(f"Terminating scheduler process {i}...")
                proc.terminate()
                proc.join(timeout=2.0)
                if proc.is_alive():
                    logger.warning(
                        f"Scheduler process {i} did not terminate, killing..."
                    )
                    proc.kill()
                    proc.join(timeout=1.0)

        logger.info("All scheduler processes terminated")

def main():
    """Main entry point for standalone gRPC server."""
    # Fix CUDA multiprocessing issues - must be called before any CUDA operations
    mp.set_start_method("spawn", force=True)

    parser = argparse.ArgumentParser(description="SGLang Standalone gRPC Server")
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args()
    server_args = ServerArgs.from_cli_args(args)

    # Run server
    asyncio.run(
        serve_grpc(
            server_args=server_args,
        )
    )


if __name__ == "__main__":
    main()
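For reference, a minimal client-side sketch of how the new streaming Generate RPC could be consumed, assuming a server started via this module's entry point (e.g. python -m sglang.srt.entrypoints.grpc_server --model-path <model>) and listening on sglang's default port 30000. This is not part of the diff: the stub name SglangSchedulerStub follows standard gRPC codegen for the SglangScheduler service shown above, message fields are filled from dicts (a protobuf constructor convenience) to avoid naming nested message types not visible in this file, and the prompt token ids are hypothetical placeholders, since the server accepts only pre-tokenized input.

import asyncio

import grpc

from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc


async def run() -> None:
    async with grpc.aio.insecure_channel("localhost:30000") as channel:
        stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
        request = sglang_scheduler_pb2.GenerateRequest(
            request_id="demo-1",
            # Message fields can be initialized from dicts; the server reads
            # tokenized.original_text and tokenized.input_ids.
            tokenized={"original_text": "Hello", "input_ids": [9906]},  # hypothetical ids
            sampling_params={"temperature": 0.7, "max_new_tokens": 32},
            stream=True,
        )
        # Generate is a server-streaming RPC: chunk messages arrive until a
        # complete (or error) message ends the stream.
        async for resp in stub.Generate(request):
            if resp.HasField("error"):
                print("error:", resp.error.message)
                break
            if resp.HasField("complete"):
                print("finish_reason:", resp.complete.finish_reason)
                print("output_ids:", list(resp.complete.output_ids))
            else:
                print("chunk token_ids:", list(resp.chunk.token_ids))


asyncio.run(run())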