PyPI - sglang - Versions diffs - 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

sglang 0.5.3rc0-py3-none-any.whl → 0.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482) hide show

sglang/bench_one_batch.py +54 -37
sglang/bench_one_batch_server.py +340 -34
sglang/bench_serving.py +340 -159
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +9 -2
sglang/profiler.py +20 -3
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +8 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +309 -0
sglang/srt/configs/load_config.py +33 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +284 -118
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +576 -0
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +6 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +26 -15
sglang/srt/debug_utils/dumper.py +10 -3
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +268 -98
sglang/srt/disaggregation/decode.py +172 -39
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +203 -555
sglang/srt/disaggregation/nixl/conn.py +217 -63
sglang/srt/disaggregation/prefill.py +113 -270
sglang/srt/disaggregation/utils.py +36 -5
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +203 -97
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +85 -65
sglang/srt/entrypoints/grpc_server.py +632 -305
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +169 -17
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +327 -34
sglang/srt/entrypoints/openai/serving_base.py +74 -8
sglang/srt/entrypoints/openai/serving_chat.py +202 -118
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +20 -4
sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
sglang/srt/entrypoints/openai/serving_responses.py +47 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +323 -0
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +3 -4
sglang/srt/eplb/expert_location.py +30 -5
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +21 -16
sglang/srt/function_call/glm4_moe_detector.py +4 -8
sglang/srt/function_call/gpt_oss_detector.py +24 -1
sglang/srt/function_call/json_array_parser.py +61 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/utils.py +98 -7
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/grpc_request_manager.py +915 -0
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
sglang/srt/layers/activation.py +11 -7
sglang/srt/layers/attention/aiter_backend.py +17 -18
sglang/srt/layers/attention/ascend_backend.py +125 -10
sglang/srt/layers/attention/attention_registry.py +226 -0
sglang/srt/layers/attention/base_attn_backend.py +32 -4
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +52 -15
sglang/srt/layers/attention/flashinfer_backend.py +357 -212
sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
sglang/srt/layers/attention/flashmla_backend.py +9 -7
sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
sglang/srt/layers/attention/mamba/mamba.py +514 -1
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +23 -0
sglang/srt/layers/attention/nsa_backend.py +1201 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/triton_backend.py +249 -42
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
sglang/srt/layers/attention/utils.py +11 -7
sglang/srt/layers/attention/vision.py +61 -3
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +19 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +28 -1
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +47 -15
sglang/srt/layers/linear.py +30 -5
sglang/srt/layers/logits_processor.py +161 -18
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
sglang/srt/layers/moe/ep_moe/layer.py +243 -448
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
sglang/srt/layers/moe/moe_runner/runner.py +3 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +3 -2
sglang/srt/layers/moe/utils.py +27 -1
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/__init__.py +2 -53
sglang/srt/layers/quantization/awq.py +183 -6
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +20 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +86 -20
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +43 -15
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +141 -81
sglang/srt/layers/quantization/mxfp4.py +17 -34
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +1 -4
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +51 -24
sglang/srt/layers/quantization/w8a8_int8.py +45 -27
sglang/srt/layers/radix_attention.py +59 -9
sglang/srt/layers/rotary_embedding.py +750 -46
sglang/srt/layers/sampler.py +84 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +23 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/base_backend.py +3 -3
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +9 -4
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora.py +7 -5
sglang/srt/lora/lora_manager.py +33 -7
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +41 -17
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
sglang/srt/lora/utils.py +7 -5
sglang/srt/managers/cache_controller.py +83 -152
sglang/srt/managers/data_parallel_controller.py +156 -87
sglang/srt/managers/detokenizer_manager.py +51 -24
sglang/srt/managers/io_struct.py +223 -129
sglang/srt/managers/mm_utils.py +49 -10
sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +130 -0
sglang/srt/managers/schedule_batch.py +340 -529
sglang/srt/managers/schedule_policy.py +158 -18
sglang/srt/managers/scheduler.py +665 -620
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
sglang/srt/managers/tokenizer_manager.py +462 -226
sglang/srt/managers/tp_worker.py +217 -156
sglang/srt/managers/utils.py +79 -47
sglang/srt/mem_cache/allocator.py +21 -22
sglang/srt/mem_cache/allocator_ascend.py +42 -28
sglang/srt/mem_cache/base_prefix_cache.py +3 -3
sglang/srt/mem_cache/chunk_cache.py +20 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +38 -0
sglang/srt/mem_cache/hicache_storage.py +44 -2
sglang/srt/mem_cache/hiradix_cache.py +134 -34
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +602 -208
sglang/srt/mem_cache/memory_pool_host.py +134 -183
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +263 -78
sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +115 -58
sglang/srt/metrics/collector.py +113 -120
sglang/srt/metrics/func_timer.py +3 -8
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +2 -2
sglang/srt/model_executor/cuda_graph_runner.py +81 -36
sglang/srt/model_executor/forward_batch_info.py +40 -50
sglang/srt/model_executor/model_runner.py +507 -319
sglang/srt/model_executor/npu_graph_runner.py +11 -5
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +438 -37
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +200 -27
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +40 -56
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +25 -4
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +793 -235
sglang/srt/models/dots_ocr.py +171 -0
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +570 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +17 -1
sglang/srt/models/gemma3n_mm.py +2 -3
sglang/srt/models/glm4_moe.py +17 -40
sglang/srt/models/glm4_moe_nextn.py +4 -4
sglang/srt/models/glm4v.py +3 -2
sglang/srt/models/glm4v_moe.py +6 -6
sglang/srt/models/gpt_oss.py +12 -35
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +4 -2
sglang/srt/models/llama.py +6 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +6 -23
sglang/srt/models/longcat_flash_nextn.py +4 -15
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +27 -6
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +0 -7
sglang/srt/models/qwen2_5_vl.py +5 -5
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +70 -4
sglang/srt/models/qwen2_vl.py +6 -3
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +50 -38
sglang/srt/models/qwen3_next.py +43 -21
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +791 -0
sglang/srt/models/qwen3_vl_moe.py +343 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +268 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +61 -0
sglang/srt/multimodal/processors/base_processor.py +21 -9
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +2 -4
sglang/srt/multimodal/processors/glm4v.py +1 -5
sglang/srt/multimodal/processors/internvl.py +20 -10
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +83 -17
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/parser/reasoning_parser.py +0 -1
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +36 -23
sglang/srt/sampling/sampling_params.py +75 -0
sglang/srt/server_args.py +1300 -338
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +161 -0
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
sglang/srt/speculative/eagle_info.py +786 -0
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +113 -1270
sglang/srt/speculative/eagle_worker.py +120 -285
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/ngram_info.py +433 -0
sglang/srt/speculative/ngram_worker.py +246 -0
sglang/srt/speculative/spec_info.py +49 -0
sglang/srt/speculative/spec_utils.py +641 -0
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/tracing/trace.py +32 -6
sglang/srt/two_batch_overlap.py +35 -18
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/{utils.py → utils/common.py} +583 -113
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/profile_merger.py +199 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/get_logits_ut.py +57 -0
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +120 -11
sglang/test/runners.py +3 -1
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +8 -2
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +3 -4
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +430 -0
sglang/test/test_deterministic_utils.py +73 -0
sglang/test/test_disaggregation_utils.py +93 -1
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +432 -16
sglang/utils.py +10 -1
sglang/version.py +1 -1
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
sglang/srt/entrypoints/grpc_request_manager.py +0 -580
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0

sglang/srt/grpc/grpc_request_manager.py ADDED Viewed

@@ -0,0 +1,915 @@
+"""
+gRPC Request Manager - Orchestrates request lifecycle without tokenization.
+Mimics TokenizerManager's state management and ZMQ communication patterns.
+"""
+import asyncio
+import copy
+import dataclasses
+import logging
+import os
+import signal
+import sys
+import threading
+import time
+import uuid
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+import grpc
+import zmq
+import zmq.asyncio
+from sglang.srt.managers.io_struct import (
+    AbortReq,
+    BatchEmbeddingOutput,
+    BatchTokenIDOutput,
+    HealthCheckOutput,
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+)
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import get_zmq_socket, kill_process_tree
+from sglang.utils import get_exception_traceback
+logger = logging.getLogger(__name__)
+class GrpcSignalHandler:
+    """Minimal signal handler for gRPC server - delegates real crash handling to scheduler."""
+    def __init__(self, grpc_manager):
+        self.grpc_manager = grpc_manager
+    def sigterm_handler(self, signum=None, frame=None):
+        """Handle SIGTERM by gracefully shutting down gRPC server."""
+        logger.warning(
+            f"SIGTERM received. {signum=} {frame=}. Shutting down gRPC server..."
+        )
+        self.grpc_manager.gracefully_exit = True
+    def running_phase_sigquit_handler(self, signum=None, frame=None):
+        """Handle SIGQUIT from failed scheduler process."""
+        logger.error(
+            "Received SIGQUIT from scheduler process. Scheduler failed, shutting down gRPC server."
+        )
+        logger.info(
+            "Note: Crash dumps are handled by the scheduler process, not the gRPC server."
+        )
+        # Just exit cleanly - the scheduler handles crash dumps
+        kill_process_tree(os.getpid(), include_parent=True)
+@dataclasses.dataclass
+class GrpcReqState:
+    """State tracking for a gRPC request.
+    GrpcRequestManager.rid_to_state maps request ids to instances of this class.
+    Fields up to and including ``created_time`` have no default and must be
+    supplied by the caller; every field after it is optional.
+    """
+    # Request identification
+    request_id: str
+    # May be None (the annotation is Optional).
+    grpc_context: Optional[grpc.aio.ServicerContext]
+    # Communication
+    out_queue: asyncio.Queue
+    finished: bool
+    event: asyncio.Event
+    # The tokenized request this state belongs to (generate or embedding).
+    obj: Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]
+    # Metrics (same as TokenizerManager's ReqState)
+    created_time: float
+    # NOTE(review): the timing fields below default to 0.0, presumably meaning
+    # "not recorded yet" - confirm against the code that updates them.
+    finished_time: float = 0.0
+    first_token_time: float = 0.0
+    last_time: float = 0.0
+    last_completion_tokens: int = 1
+    # Streaming state
+    stream_finished: bool = False
+    input_logprobs_sent: bool = False  # Track if input logprobs were sent in streaming
+    # Token accumulation (for non-streaming)
+    # default_factory=list gives every instance its own fresh list instead of
+    # one list object shared across all requests.
+    output_ids: List[int] = dataclasses.field(default_factory=list)
+    input_token_logprobs_val: List[float] = dataclasses.field(default_factory=list)
+    input_token_logprobs_idx: List[int] = dataclasses.field(default_factory=list)
+    output_token_logprobs_val: List[float] = dataclasses.field(default_factory=list)
+    output_token_logprobs_idx: List[int] = dataclasses.field(default_factory=list)
+    input_top_logprobs_val: List[List[float]] = dataclasses.field(default_factory=list)
+    input_top_logprobs_idx: List[List[int]] = dataclasses.field(default_factory=list)
+    output_top_logprobs_val: List[List[float]] = dataclasses.field(default_factory=list)
+    output_top_logprobs_idx: List[List[int]] = dataclasses.field(default_factory=list)
+    # Session state
+    session_id: Optional[str] = None
+    is_session_request: bool = False
+class GrpcRequestManager:
+    """
+    Manages gRPC request lifecycle, mimicking TokenizerManager's orchestration
+    behaviors without tokenization.
+    """
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        bootstrap_server=None,
+    ):
+        """Initialize the gRPC request manager.
+        Creates the ZMQ context and the two scheduler-facing sockets, then sets
+        up the request-state, pause, metrics and crash-dump attributes.
+        Args:
+            server_args: Server configuration; stored, and its
+                ``disaggregation_mode`` is logged when a bootstrap server is given.
+            port_args: Supplies ``detokenizer_ipc_name`` (receive endpoint) and
+                ``scheduler_input_ipc_name`` (send endpoint).
+            bootstrap_server: Optional, already-created bootstrap server; it is
+                only stored here, never started.
+        """
+        self.server_args = server_args
+        self.port_args = port_args
+        # ZMQ Communication Setup (same pattern as TokenizerManager)
+        # The positional argument is pyzmq's io_threads count.
+        self.context = zmq.asyncio.Context(2)
+        # Socket for receiving outputs from scheduler
+        # Note that the receive side uses the detokenizer IPC name.
+        self.recv_from_scheduler = get_zmq_socket(
+            self.context, zmq.PULL, port_args.detokenizer_ipc_name, bind=True
+        )
+        # Socket for sending requests to scheduler
+        self.send_to_scheduler = get_zmq_socket(
+            self.context, zmq.PUSH, port_args.scheduler_input_ipc_name, bind=True
+        )
+        # State Management (from TokenizerManager)
+        # Per-request state, keyed by request id.
+        self.rid_to_state: Dict[str, GrpcReqState] = {}
+        self.asyncio_tasks: set = set()
+        # Set to True by GrpcSignalHandler.sigterm_handler.
+        self.gracefully_exit = False
+        self.no_create_loop = False
+        self.event_loop = None
+        # Pause/Resume Control
+        self.is_pause = False
+        self.is_pause_cond = asyncio.Condition()
+        # Metrics
+        self.last_receive_tstamp = time.time()
+        # Crash dump for debugging
+        self.crash_dump_request_list = []
+        self.crash_dump_performed = False
+        # Bootstrap server (passed from serve_grpc, not started here)
+        self.bootstrap_server = bootstrap_server
+        logger.info(
+            f"GrpcRequestManager initialized with ZMQ IPC: "
+            f"recv={port_args.detokenizer_ipc_name}, "
+            f"send={port_args.scheduler_input_ipc_name}"
+        )
+        # The disaggregation mode is logged only when a bootstrap server was passed in.
+        if self.bootstrap_server:
+            logger.info(
+                f"Bootstrap server initialized for disaggregation mode: "
+                f"{server_args.disaggregation_mode}"
+            )
+    async def generate_request(
+        self,
+        obj: TokenizedGenerateReqInput,
+        request_id: Optional[str] = None,
+        grpc_context: Optional[grpc.aio.ServicerContext] = None,
+    ) -> AsyncGenerator[Union[Dict, List[Dict]], None]:
+        """
+        Submit a generation request to the scheduler with n>1 parallel sampling support.
+        This method implements the same two-phase approach as tokenizer_manager.py:
+        1. Phase 1: Send prefix caching request (max_new_tokens=0)
+        2. Phase 2: Send n generation requests that reuse the cached prefix
+        Yields individual responses for streaming, or aggregated responses for non-streaming.
+        """
+        n = getattr(obj.sampling_params, "n", 1)
+        if n <= 1:
+            async for response in self._handle_single_request(
+                obj, request_id, grpc_context
+            ):
+                yield response
+            return
+        # N>1 handling - two-phase approach
+        logger.debug(f"Multiple sampling request (n={n}), using two-phase approach")
+        # Generate base request ID if not provided
+        if request_id is None:
+            base_request_id = f"grpc-{uuid.uuid4().hex}"
+        else:
+            base_request_id = request_id
+        # Phase 1: Cache the common prefix
+        logger.debug(f"Phase 1: Caching prefix for request {base_request_id}")
+        prefix_obj = copy.copy(obj)
+        prefix_obj.sampling_params = copy.copy(obj.sampling_params)
+        prefix_obj.sampling_params.max_new_tokens = 0  # Prefill-only
+        prefix_obj.sampling_params.n = 1  # Don't replicate prefix request
+        # Send prefix caching request and consume response
+        async for _ in self._handle_single_request(
+            prefix_obj, f"{base_request_id}-prefix", grpc_context
+        ):
+            # Consume prefix response (usually just one chunk with finish_reason)
+            pass
+        logger.debug(f"Phase 1 completed: Prefix cached for {base_request_id}")
+        # Phase 2: Generate n parallel requests
+        logger.debug(f"Phase 2: Generating {n} parallel requests")
+        generators = []
+        request_ids = []
+        for i in range(n):
+            # Create individual generation request
+            gen_obj = copy.copy(obj)
+            gen_obj.sampling_params = copy.copy(obj.sampling_params)
+            gen_obj.sampling_params.n = 1  # Each request generates 1 response
+            gen_request_id = f"{base_request_id}-{i}"
+            request_ids.append(gen_request_id)
+            # Start generation request
+            generators.append(
+                self._handle_single_request(gen_obj, gen_request_id, grpc_context)
+            )
+        # Handle response aggregation
+        is_stream = getattr(obj, "stream", False)
+        if not is_stream:
+            # Non-streaming: collect all responses and return as batch
+            logger.debug(f"Non-streaming mode: collecting {n} responses")
+            responses = []
+            for generator in generators:
+                async for response in generator:
+                    responses.append(response)
+            yield responses  # Return all responses as a batch
+        else:
+            # Streaming mode: multiplex responses with index for ordering
+            logger.debug(f"Streaming mode: multiplexing {n} streams")
+            rid_to_index = {rid: i for i, rid in enumerate(request_ids)}
+            # Create async tasks for all generators
+            task_map = {}
+            for generator in generators:
+                task = asyncio.create_task(generator.__anext__())
+                task_map[task] = generator
+            # Process responses as they arrive
+            while task_map:
+                done, _ = await asyncio.wait(
+                    task_map.keys(), return_when=asyncio.FIRST_COMPLETED
+                )
+                for task in done:
+                    generator = task_map.pop(task)
+                    try:
+                        response = await task
+                        # Add index for client-side ordering
+                        if isinstance(response, dict):
+                            response_rid = response.get("request_id", "")
+                            if response_rid in rid_to_index:
+                                response["index"] = rid_to_index[response_rid]
+                        yield response
+                        # Create next task for this generator
+                        next_task = asyncio.create_task(generator.__anext__())
+                        task_map[next_task] = generator
+                    except StopAsyncIteration:
+                        # This generator is finished
+                        pass
+    async def _handle_single_request(
+        self,
+        obj: TokenizedGenerateReqInput,
+        request_id: Optional[str] = None,
+        grpc_context: Optional[grpc.aio.ServicerContext] = None,
+    ):
+        """Handle a single request - core implementation without n>1 logic.
+        Registers a GrpcReqState under the request id, sends obj to the scheduler
+        and relays entries from the state's out_queue until a dict entry with a
+        truthy "finished" value is seen.
+        Args:
+            obj: Request to submit; its rid attribute is overwritten with the id used.
+            request_id: Id to use; "grpc-<uuid hex>" is generated when None.
+            grpc_context: Stored on the request state.
+        Yields:
+            Streaming (obj.stream truthy): every queue entry as received.
+            Non-streaming: only a copy of the finished entry whose "token_ids" is
+            replaced by the ids accumulated in state.output_ids.
+        Raises:
+            asyncio.CancelledError: re-raised after abort_request() has been awaited.
+            Any exception raised by _send_to_scheduler propagates to the caller.
+        """
+        # Generate request ID if not provided
+        if request_id is None:
+            request_id = f"grpc-{uuid.uuid4().hex}"
+        obj.rid = request_id
+        # Create and register request state
+        # TODO: support log_request
+        state = GrpcReqState(
+            request_id=request_id,
+            grpc_context=grpc_context,
+            out_queue=asyncio.Queue(),
+            finished=False,
+            event=asyncio.Event(),
+            obj=obj,
+            created_time=time.time(),
+        )
+        # Track session if needed
+        if hasattr(obj, "session_params") and obj.session_params:
+            state.session_id = obj.session_params.session_id
+            state.is_session_request = True
+        # Register before sending so outputs for this rid can be routed to the state
+        self.rid_to_state[request_id] = state
+        self.record_request_for_crash_dump(obj)
+        try:
+            # Send to scheduler - let exceptions bubble up to grpc_server.py
+            await self._send_to_scheduler(obj)
+            is_stream = getattr(obj, "stream", False)
+            while True:
+                try:
+                    response = await state.out_queue.get()
+                    if is_stream:
+                        yield response
+                    # Non-streaming: yield final response with accumulated tokens from state
+                    if isinstance(response, dict) and response.get("finished", False):
+                        if not is_stream:
+                            # Copy so the queued dict itself is left untouched
+                            final_response = response.copy()
+                            final_response["token_ids"] = state.output_ids
+                            yield final_response
+                        break
+                except asyncio.CancelledError:
+                    # Task was cancelled by gRPC framework when client disconnected
+                    logger.info(f"Request {request_id} cancelled by client")
+                    await self.abort_request(request_id)
+                    raise  # Re-raise to let gRPC server handle cleanup
+        finally:
+            # Always clean up request state when exiting
+            self._cleanup_request_state(request_id)
+    def _cleanup_request_state(self, request_id: str):
+        """Clean up local request state (does not notify scheduler)."""
+        if request_id in self.rid_to_state:
+            del self.rid_to_state[request_id]
+    async def embedding_request(
+        self,
+        obj: TokenizedEmbeddingReqInput,
+        request_id: Optional[str] = None,
+    ) -> asyncio.Future:
+        """
+        Submit an embedding request to the scheduler.
+        Returns a future that will contain the embedding result.
+        This is a coroutine: awaiting it sends the request and returns the future;
+        the future itself must be awaited separately to obtain the result.
+        Args:
+            obj: Request to submit; its rid attribute is overwritten with the id used.
+            request_id: Id to use; "grpc-embed-<uuid hex>" is generated when None.
+        Returns:
+            A future resolved with the first entry put on the request's out_queue
+            after its event is set. If sending to the scheduler fails, the future is
+            returned already holding that exception.
+        """
+        # Generate request ID if not provided
+        if request_id is None:
+            request_id = f"grpc-embed-{uuid.uuid4().hex}"
+        obj.rid = request_id
+        # Create request state
+        state = GrpcReqState(
+            request_id=request_id,
+            grpc_context=None,
+            out_queue=asyncio.Queue(),
+            finished=False,
+            event=asyncio.Event(),
+            obj=obj,
+            created_time=time.time(),
+        )
+        # Register state
+        self.rid_to_state[request_id] = state
+        # Create future for result
+        future = asyncio.Future()
+        # Send to scheduler
+        try:
+            await self._send_to_scheduler(obj)
+        except Exception as e:
+            # Undo the registration; the failure is reported through the future
+            # rather than raised from this coroutine.
+            del self.rid_to_state[request_id]
+            future.set_exception(e)
+            return future
+        # Wait for result in background
+        async def wait_for_result():
+            try:
+                await state.event.wait()
+                result = await state.out_queue.get()
+                future.set_result(result)
+            except Exception as e:
+                future.set_exception(e)
+            finally:
+                # Clean up
+                if request_id in self.rid_to_state:
+                    del self.rid_to_state[request_id]
+        # NOTE(review): the task object is not stored anywhere in this method; the
+        # asyncio docs recommend keeping a reference to tasks created this way.
+        asyncio.create_task(wait_for_result())
+        return future
+    async def abort_request(self, request_id: str) -> bool:
+        """Abort a running request.
+        Sends abort request to scheduler and marks local state as finished
+        to stop processing any further outputs from the scheduler.
+        """
+        # Skip aborting health check requests (they clean themselves up)
+        if request_id.startswith("HEALTH_CHECK"):
+            return False
+        # Mark state as finished immediately to stop processing scheduler outputs
+        state = self.rid_to_state.get(request_id)
+        if state:
+            state.finished = True
+            state.stream_finished = True
+            logger.debug(f"Marked request {request_id} as aborted locally")
+        # Send abort to scheduler - the scheduler will send AbortReq back
+        # which will be handled by _handle_abort_req
+        abort_req = AbortReq(rid=request_id)
+        try:
+            await self._send_to_scheduler(abort_req)
+            logger.debug(f"Sent abort to scheduler for request {request_id}")
+        except Exception as e:
+            logger.error(f"Failed to send abort request to scheduler: {e}")
+            return False
+        return True
+    async def handle_loop(self):
+        """
+        Main event loop - processes outputs from scheduler.
+        Mimics TokenizerManager's handle_loop.
+        Runs until gracefully_exit is set. Each received object is dispatched by
+        type to the matching _handle_* coroutine; unknown types are logged and
+        dropped. A zmq Again error continues the loop, any other ZMQError ends it,
+        and other exceptions are logged and the loop keeps running unless a
+        graceful exit was requested.
+        """
+        while not self.gracefully_exit:
+            try:
+                # Receive from scheduler
+                recv_obj = await self.recv_from_scheduler.recv_pyobj()
+                self.last_receive_tstamp = time.time()
+                # Check for pause (optimized: check flag before acquiring lock)
+                # Note the object was already received; only its handling is delayed.
+                if self.is_pause:
+                    async with self.is_pause_cond:
+                        while self.is_pause:
+                            await self.is_pause_cond.wait()
+                # Handle different output types
+                if isinstance(recv_obj, BatchTokenIDOutput):
+                    await self._handle_batch_output(recv_obj)
+                elif isinstance(recv_obj, BatchEmbeddingOutput):
+                    await self._handle_embedding_output(recv_obj)
+                elif isinstance(recv_obj, HealthCheckOutput):
+                    await self._handle_health_check_output(recv_obj)
+                elif isinstance(recv_obj, AbortReq):
+                    await self._handle_abort_req(recv_obj)
+                else:
+                    logger.warning(f"Unknown output type: {type(recv_obj)}")
+            except zmq.error.Again:
+                # Timeout, check if we should exit
+                if self.gracefully_exit:
+                    break
+                continue
+            except zmq.error.ZMQError as e:
+                # Socket closed or other ZMQ error - exit cleanly if shutting down
+                if self.gracefully_exit:
+                    logger.debug(f"ZMQ recv interrupted during shutdown: {e}")
+                    break
+                logger.error(
+                    f"ZMQ error in handle loop: {e}\n{get_exception_traceback()}"
+                )
+                # A ZMQ error outside shutdown also terminates the loop
+                break
+            except Exception as e:
+                # Any other failure is logged; the loop only stops if shutting down
+                logger.error(f"Handle loop error: {e}\n{get_exception_traceback()}")
+                if self.gracefully_exit:
+                    break
+    def _convert_logprob_style(
+        self,
+        state: GrpcReqState,
+        batch_out: BatchTokenIDOutput,
+        batch_index: int,
+    ):
+        """
+        Convert and accumulate logprobs from batch output to state.
+        Follows the same logic as tokenizer_manager.convert_logprob_style.
+        """
+        # Early exit if no input logprobs at all
+        if batch_out.input_token_logprobs_val is None:
+            return
+        # Accumulate input token logprobs (only if list is non-empty)
+        if len(batch_out.input_token_logprobs_val) > 0:
+            state.input_token_logprobs_val.extend(
+                batch_out.input_token_logprobs_val[batch_index]
+            )
+            state.input_token_logprobs_idx.extend(
+                batch_out.input_token_logprobs_idx[batch_index]
+            )
+        # Always accumulate output token logprobs
+        state.output_token_logprobs_val.extend(
+            batch_out.output_token_logprobs_val[batch_index]
+        )
+        state.output_token_logprobs_idx.extend(
+            batch_out.output_token_logprobs_idx[batch_index]
+        )
+        # Handle top logprobs if requested
+        if state.obj.top_logprobs_num > 0:
+            # Accumulate input top logprobs (only if list is non-empty)
+            if len(batch_out.input_top_logprobs_val) > 0:
+                state.input_top_logprobs_val.extend(
+                    batch_out.input_top_logprobs_val[batch_index]
+                )
+                state.input_top_logprobs_idx.extend(
+                    batch_out.input_top_logprobs_idx[batch_index]
+                )
+            # Always accumulate output top logprobs
+            state.output_top_logprobs_val.extend(
+                batch_out.output_top_logprobs_val[batch_index]
+            )
+            state.output_top_logprobs_idx.extend(
+                batch_out.output_top_logprobs_idx[batch_index]
+            )
+    async def _handle_batch_output(self, batch_out: BatchTokenIDOutput):
+        """Handle batch generation output from scheduler.
+        For every rid in the batch that is tracked locally and not yet finished,
+        builds an output dict (token ids, finished flag, meta_info and, when
+        requested, logprobs), appends the token ids to the request state and puts
+        the dict on the request's out_queue. Requests whose finished reason is not
+        None are marked finished and scheduled for removal from rid_to_state
+        after a 5 second delay.
+        """
+        # Collect all queue.put() tasks for parallel execution
+        put_tasks = []
+        # NOTE(review): cleanup_tasks is only appended to below; nothing in this
+        # method awaits or otherwise reads it.
+        cleanup_tasks = []
+        now = time.time()
+        # Process each request in the batch
+        for i, rid in enumerate(batch_out.rids):
+            if rid not in self.rid_to_state:
+                continue
+            state = self.rid_to_state[rid]
+            # Skip if already aborted/finished locally (client cancelled)
+            if state.finished:
+                logger.debug(f"Skipping output for aborted request {rid}")
+                continue
+            # Update metrics
+            if state.first_token_time == 0.0:
+                state.first_token_time = now
+            state.last_time = now
+            # Extract output for this request
+            output_data = {
+                "request_id": rid,
+                "token_ids": batch_out.output_ids[i] if batch_out.output_ids else [],
+                "finished": batch_out.finished_reasons[i] is not None,
+                "meta_info": {
+                    "prompt_tokens": (
+                        batch_out.prompt_tokens[i] if batch_out.prompt_tokens else 0
+                    ),
+                    "completion_tokens": (
+                        batch_out.completion_tokens[i]
+                        if batch_out.completion_tokens
+                        else 0
+                    ),
+                    "cached_tokens": (
+                        batch_out.cached_tokens[i] if batch_out.cached_tokens else 0
+                    ),
+                    "finish_reason": (
+                        batch_out.finished_reasons[i]
+                        if batch_out.finished_reasons[i]
+                        else None
+                    ),
+                },
+            }
+            # Accumulate logprobs (following tokenizer_manager pattern)
+            if state.obj.return_logprob:
+                self._convert_logprob_style(state, batch_out, i)
+            # Send input logprobs if available
+            if (
+                state.obj.return_logprob
+                and state.obj.logprob_start_len >= 0
+                and state.input_token_logprobs_val
+            ):
+                if state.obj.stream and not state.input_logprobs_sent:
+                    # Streaming: send input logprobs once in first chunk that has them
+                    output_data["input_logprobs"] = {
+                        "token_logprobs_val": state.input_token_logprobs_val,
+                        "token_logprobs_idx": state.input_token_logprobs_idx,
+                        "top_logprobs_val": state.input_top_logprobs_val,
+                        "top_logprobs_idx": state.input_top_logprobs_idx,
+                    }
+                    state.input_logprobs_sent = True
+                elif not state.obj.stream and output_data["finished"]:
+                    # Non-streaming: send input logprobs in final chunk
+                    output_data["input_logprobs"] = {
+                        "token_logprobs_val": state.input_token_logprobs_val,
+                        "token_logprobs_idx": state.input_token_logprobs_idx,
+                        "top_logprobs_val": state.input_top_logprobs_val,
+                        "top_logprobs_idx": state.input_top_logprobs_idx,
+                    }
+            # Send output logprobs if available
+            if (
+                state.obj.return_logprob
+                and batch_out.output_token_logprobs_val
+                and i < len(batch_out.output_token_logprobs_val)
+            ):
+                if state.obj.stream:
+                    # For streaming: send incremental logprobs (only new tokens in this chunk)
+                    # NOTE: this is different than TokenizerManager, which always accumulates
+                    # get_part is called right away below, so it sees the current i
+                    def get_part(attr_name):
+                        source_list = getattr(batch_out, attr_name, None)
+                        return (
+                            source_list[i]
+                            if source_list and i < len(source_list)
+                            else []
+                        )
+                    output_data["output_logprobs"] = {
+                        "token_logprobs_val": batch_out.output_token_logprobs_val[i],
+                        "token_logprobs_idx": get_part("output_token_logprobs_idx"),
+                        "top_logprobs_val": get_part("output_top_logprobs_val"),
+                        "top_logprobs_idx": get_part("output_top_logprobs_idx"),
+                    }
+                elif output_data["finished"]:
+                    # Non-streaming: send cumulative output logprobs in final chunk
+                    output_data["output_logprobs"] = {
+                        "token_logprobs_val": state.output_token_logprobs_val,
+                        "token_logprobs_idx": state.output_token_logprobs_idx,
+                        "top_logprobs_val": state.output_top_logprobs_val,
+                        "top_logprobs_idx": state.output_top_logprobs_idx,
+                    }
+            # Update state for accumulation
+            if output_data["token_ids"]:
+                state.output_ids.extend(output_data["token_ids"])
+            # Add queue.put() to parallel task list
+            put_tasks.append(state.out_queue.put(output_data))
+            # Handle completion
+            if output_data["finished"]:
+                state.finished = True
+                state.finished_time = now
+                state.stream_finished = True
+                state.event.set()
+                # Remove from tracking after a delay
+                async def cleanup(request_id):
+                    await asyncio.sleep(5.0)
+                    if request_id in self.rid_to_state:
+                        del self.rid_to_state[request_id]
+                # rid is passed as an argument so each task keeps its own id
+                cleanup_tasks.append(asyncio.create_task(cleanup(rid)))
+        # Execute all queue.put() operations in parallel
+        # return_exceptions=True: a failing put is returned as a result, not raised
+        if put_tasks:
+            await asyncio.gather(*put_tasks, return_exceptions=True)
+    async def _handle_embedding_output(self, batch_out: BatchEmbeddingOutput):
+        """Handle batch embedding output from scheduler."""
+        for i, rid in enumerate(batch_out.rids):
+            if rid not in self.rid_to_state:
+                continue
+            state = self.rid_to_state[rid]
+            # Create result
+            result = {
+                "request_id": rid,
+                "embedding": batch_out.embeddings[i],
+                "prompt_tokens": (
+                    batch_out.prompt_tokens[i] if batch_out.prompt_tokens else 0
+                ),
+                "finish_reason": (
+                    batch_out.finish_reason[i] if batch_out.finish_reason else None
+                ),
+            }
+            # Send result
+            await state.out_queue.put(result)
+            # Mark as finished
+            state.finished = True
+            state.finished_time = time.time()
+            state.event.set()
+    async def _handle_health_check_output(self, health_out: HealthCheckOutput):
+        """Handle health check output from scheduler."""
+        rid = health_out.rid
+        if rid not in self.rid_to_state:
+            logger.warning(f"Health check output for unknown request: {rid}")
+            return
+        state = self.rid_to_state[rid]
+        # Create health check result
+        result = {
+            "request_id": rid,
+            "healthy": True,  # If we got a response, scheduler is healthy
+            "output_text": (
+                health_out.output_str if hasattr(health_out, "output_str") else ""
+            ),
+            "finish_reason": (
+                health_out.finish_reason
+                if hasattr(health_out, "finish_reason")
+                else "stop"
+            ),
+        }
+        # Send result
+        await state.out_queue.put(result)
+        # Mark as finished
+        state.finished = True
+        state.finished_time = time.time()
+        state.event.set()
+    async def _handle_abort_req(self, recv_obj: AbortReq):
+        """Handle abort request from scheduler.
+        The scheduler sends AbortReq back to notify us that a request was aborted,
+        either due to explicit abort_request() call or scheduler-initiated abort
+        (priority preemption, queue full, KV cache pressure, etc).
+        """
+        # Skip health check requests
+        if recv_obj.rid.startswith("HEALTH_CHECK"):
+            return
+        # Check if request still exists
+        if recv_obj.rid not in self.rid_to_state:
+            logger.debug(
+                f"Abort request for {recv_obj.rid} not in local state (may have already finished or not started yet)"
+            )
+            return
+        state = self.rid_to_state[recv_obj.rid]
+        # Mark as finished
+        state.finished = True
+        state.stream_finished = True
+        # Create abort response
+        if recv_obj.finished_reason:
+            # Scheduler provided a specific finish reason (e.g., priority preemption, queue full)
+            abort_response = {
+                "request_id": recv_obj.rid,
+                "error": recv_obj.finished_reason.get("message", "Request aborted"),
+                "finished": True,
+                "meta_info": {
+                    "id": recv_obj.rid,
+                    "finish_reason": recv_obj.finished_reason,
+                },
+            }
+        else:
+            # Generic abort (e.g., explicit abort_request call)
+            abort_response = {
+                "request_id": recv_obj.rid,
+                "error": "Request aborted",
+                "finished": True,
+                "meta_info": {
+                    "id": recv_obj.rid,
+                    "finish_reason": {
+                        "type": "abort",
+                        "message": "Abort before prefill",
+                    },
+                    "prompt_tokens": 0,
+                    "completion_tokens": 0,
+                },
+            }
+        # Send abort notification to output queue
+        await state.out_queue.put(abort_response)
+        # Wake up any waiting coroutines
+        state.event.set()
+        logger.debug(f"Handled abort request for {recv_obj.rid}")
+    async def _send_to_scheduler(self, obj):
+        """Send an object to the scheduler via ZMQ."""
+        try:
+            self.send_to_scheduler.send_pyobj(obj)
+        except Exception as e:
+            logger.error(f"Failed to send to scheduler: {e}")
+            raise
+    def record_request_for_crash_dump(self, obj):
+        """Record request for potential crash dump."""
+        if len(self.crash_dump_request_list) < 100:
+            self.crash_dump_request_list.append(
+                {
+                    "time": time.time(),
+                    "request_id": getattr(obj, "rid", "unknown"),
+                    "type": type(obj).__name__,
+                }
+            )
+    async def shutdown(self):
+        """Gracefully shutdown the request manager.
+        Steps, in order: set gracefully_exit, cancel and await the tracked asyncio
+        tasks, notify every unfinished request with a shutdown error entry, shut
+        down the bootstrap server when one exists, close both ZMQ sockets and
+        terminate the ZMQ context.
+        """
+        logger.info("Shutting down GrpcRequestManager")
+        self.gracefully_exit = True
+        # Cancel all asyncio tasks FIRST - this will interrupt blocked recv() calls
+        for task in list(self.asyncio_tasks):
+            if not task.done():
+                task.cancel()
+        # Give tasks a moment to process cancellation
+        # return_exceptions=True keeps the resulting cancellations from raising here
+        if self.asyncio_tasks:
+            await asyncio.gather(*list(self.asyncio_tasks), return_exceptions=True)
+        # Cancel all pending requests
+        for rid, state in list(self.rid_to_state.items()):
+            if not state.finished:
+                # NOTE(review): this entry has no "finished" key - confirm that
+                # consumers of out_queue recognise it as terminal.
+                await state.out_queue.put(
+                    {"error": "Server shutting down", "shutdown": True}
+                )
+                state.finished = True
+                state.event.set()
+        # Wait for tasks to complete
+        if self.asyncio_tasks:
+            await asyncio.gather(*list(self.asyncio_tasks), return_exceptions=True)
+        # Shutdown bootstrap server if running
+        if self.bootstrap_server:
+            logger.info("Shutting down bootstrap server")
+            try:
+                # Supports both coroutine and plain shutdown() implementations
+                if hasattr(self.bootstrap_server, "shutdown"):
+                    if asyncio.iscoroutinefunction(self.bootstrap_server.shutdown):
+                        await self.bootstrap_server.shutdown()
+                    else:
+                        self.bootstrap_server.shutdown()
+            except Exception as e:
+                # Best effort: a failing bootstrap shutdown must not stop teardown
+                logger.warning(f"Error shutting down bootstrap server: {e}")
+        # Close ZMQ sockets
+        self.recv_from_scheduler.close()
+        self.send_to_scheduler.close()
+        # Terminate the ZMQ context - this is critical for asyncio loop to exit cleanly
+        self.context.term()
+        logger.info("GrpcRequestManager shutdown complete")
+    def get_server_info(self) -> Dict[str, Any]:
+        """Get server information for health checks."""
+        return {
+            "active_requests": len(self.rid_to_state),
+            "paused": self.is_pause,
+            "last_receive_time": self.last_receive_tstamp,
+        }
+    def auto_create_handle_loop(self):
+        """Automatically create and start the handle_loop task, matching TokenizerManager pattern.
+        Only the first call does anything: it sets no_create_loop, schedules
+        handle_loop and sigterm_watchdog on the current event loop (both wrapped in
+        print_exception_wrapper and tracked in asyncio_tasks) and, when running in
+        the main thread, installs SIGTERM and SIGQUIT handlers on the loop.
+        """
+        if self.no_create_loop:
+            return
+        # Set before creating tasks so later calls return early
+        self.no_create_loop = True
+        loop = asyncio.get_event_loop()
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.handle_loop))
+        )
+        self.event_loop = loop
+        # We cannot add signal handler when the grpc manager is not in
+        # the main thread due to the CPython limitation.
+        if threading.current_thread() is threading.main_thread():
+            signal_handler = GrpcSignalHandler(self)
+            loop.add_signal_handler(signal.SIGTERM, signal_handler.sigterm_handler)
+            # Update the signal handler for the process. It overrides the sigquit handler in the launch phase.
+            loop.add_signal_handler(
+                signal.SIGQUIT, signal_handler.running_phase_sigquit_handler
+            )
+        else:
+            logger.warning(
+                "Signal handler is not added because the grpc request manager is "
+                "not in the main thread. This disables graceful shutdown of the "
+                "grpc request manager when SIGTERM is received."
+            )
+        # The watchdog task is created regardless of which thread this runs in
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.sigterm_watchdog))
+        )
+    async def sigterm_watchdog(self):
+        """Watchdog to handle SIGTERM gracefully, matching TokenizerManager pattern."""
+        while not self.gracefully_exit:
+            await asyncio.sleep(1.0)
+async def print_exception_wrapper(func):
+    """
+    Sometimes an asyncio function does not print exception.
+    We do another wrapper to handle the exception.
+    """
+    try:
+        await func()
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"GrpcRequestManager hit an exception: {traceback}")
+        if hasattr(func, "__self__") and isinstance(func.__self__, GrpcRequestManager):
+            func.__self__.dump_requests_before_crash()
+        kill_process_tree(os.getpid(), include_parent=True)
+        sys.exit(1)

sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

sglang 0.5.3rc0py3-none-any.whl → 0.5.4py3-none-any.whl