sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +54 -37
- sglang/bench_one_batch_server.py +340 -34
- sglang/bench_serving.py +340 -159
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +9 -2
- sglang/profiler.py +20 -3
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +309 -0
- sglang/srt/configs/load_config.py +33 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +284 -118
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +576 -0
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +6 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +26 -15
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +268 -98
- sglang/srt/disaggregation/decode.py +172 -39
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +203 -555
- sglang/srt/disaggregation/nixl/conn.py +217 -63
- sglang/srt/disaggregation/prefill.py +113 -270
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +203 -97
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +85 -65
- sglang/srt/entrypoints/grpc_server.py +632 -305
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +169 -17
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +327 -34
- sglang/srt/entrypoints/openai/serving_base.py +74 -8
- sglang/srt/entrypoints/openai/serving_chat.py +202 -118
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +20 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +47 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +323 -0
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +21 -16
- sglang/srt/function_call/glm4_moe_detector.py +4 -8
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +61 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +98 -7
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/grpc_request_manager.py +915 -0
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
- sglang/srt/layers/activation.py +11 -7
- sglang/srt/layers/attention/aiter_backend.py +17 -18
- sglang/srt/layers/attention/ascend_backend.py +125 -10
- sglang/srt/layers/attention/attention_registry.py +226 -0
- sglang/srt/layers/attention/base_attn_backend.py +32 -4
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +52 -15
- sglang/srt/layers/attention/flashinfer_backend.py +357 -212
- sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
- sglang/srt/layers/attention/flashmla_backend.py +9 -7
- sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
- sglang/srt/layers/attention/mamba/mamba.py +514 -1
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +23 -0
- sglang/srt/layers/attention/nsa_backend.py +1201 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +249 -42
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
- sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +61 -3
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +19 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +28 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +47 -15
- sglang/srt/layers/linear.py +30 -5
- sglang/srt/layers/logits_processor.py +161 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
- sglang/srt/layers/moe/ep_moe/layer.py +243 -448
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +27 -1
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +86 -20
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +43 -15
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +141 -81
- sglang/srt/layers/quantization/mxfp4.py +17 -34
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -24
- sglang/srt/layers/quantization/w8a8_int8.py +45 -27
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +750 -46
- sglang/srt/layers/sampler.py +84 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +23 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +9 -4
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +33 -7
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +41 -17
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +83 -152
- sglang/srt/managers/data_parallel_controller.py +156 -87
- sglang/srt/managers/detokenizer_manager.py +51 -24
- sglang/srt/managers/io_struct.py +223 -129
- sglang/srt/managers/mm_utils.py +49 -10
- sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +130 -0
- sglang/srt/managers/schedule_batch.py +340 -529
- sglang/srt/managers/schedule_policy.py +158 -18
- sglang/srt/managers/scheduler.py +665 -620
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
- sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
- sglang/srt/managers/tokenizer_manager.py +462 -226
- sglang/srt/managers/tp_worker.py +217 -156
- sglang/srt/managers/utils.py +79 -47
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +42 -28
- sglang/srt/mem_cache/base_prefix_cache.py +3 -3
- sglang/srt/mem_cache/chunk_cache.py +20 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +38 -0
- sglang/srt/mem_cache/hicache_storage.py +44 -2
- sglang/srt/mem_cache/hiradix_cache.py +134 -34
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +602 -208
- sglang/srt/mem_cache/memory_pool_host.py +134 -183
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +263 -78
- sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +115 -58
- sglang/srt/metrics/collector.py +113 -120
- sglang/srt/metrics/func_timer.py +3 -8
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +81 -36
- sglang/srt/model_executor/forward_batch_info.py +40 -50
- sglang/srt/model_executor/model_runner.py +507 -319
- sglang/srt/model_executor/npu_graph_runner.py +11 -5
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +438 -37
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +200 -27
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +40 -56
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +25 -4
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +793 -235
- sglang/srt/models/dots_ocr.py +171 -0
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +570 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -3
- sglang/srt/models/glm4_moe.py +17 -40
- sglang/srt/models/glm4_moe_nextn.py +4 -4
- sglang/srt/models/glm4v.py +3 -2
- sglang/srt/models/glm4v_moe.py +6 -6
- sglang/srt/models/gpt_oss.py +12 -35
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +4 -2
- sglang/srt/models/llama.py +6 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +6 -23
- sglang/srt/models/longcat_flash_nextn.py +4 -15
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +27 -6
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +5 -5
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +70 -4
- sglang/srt/models/qwen2_vl.py +6 -3
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +50 -38
- sglang/srt/models/qwen3_next.py +43 -21
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +791 -0
- sglang/srt/models/qwen3_vl_moe.py +343 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +268 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +61 -0
- sglang/srt/multimodal/processors/base_processor.py +21 -9
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +2 -4
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +20 -10
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +83 -17
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +36 -23
- sglang/srt/sampling/sampling_params.py +75 -0
- sglang/srt/server_args.py +1300 -338
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +161 -0
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
- sglang/srt/speculative/eagle_info.py +786 -0
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +113 -1270
- sglang/srt/speculative/eagle_worker.py +120 -285
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/ngram_info.py +433 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +49 -0
- sglang/srt/speculative/spec_utils.py +641 -0
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +35 -18
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/{utils.py → utils/common.py} +583 -113
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +120 -11
- sglang/test/runners.py +3 -1
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +8 -2
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +3 -4
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +430 -0
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +93 -1
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +432 -16
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
- sglang/srt/entrypoints/grpc_request_manager.py +0 -580
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
|
@@ -14,10 +14,10 @@
|
|
|
14
14
|
"""Utilities for Prometheus Metrics Collection."""
|
|
15
15
|
import time
|
|
16
16
|
from dataclasses import dataclass, field
|
|
17
|
-
from enum import Enum
|
|
18
17
|
from typing import Dict, List, Optional, Union
|
|
19
18
|
|
|
20
|
-
from sglang.srt.
|
|
19
|
+
from sglang.srt.disaggregation.utils import DisaggregationMode
|
|
20
|
+
from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
|
|
21
21
|
from sglang.srt.server_args import ServerArgs
|
|
22
22
|
from sglang.srt.utils import get_bool_env_var
|
|
23
23
|
|
|
@@ -34,6 +34,7 @@ class TimeStats:
|
|
|
34
34
|
Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
|
+
disagg_mode: DisaggregationMode = DisaggregationMode.NULL
|
|
37
38
|
lb_entry_time: float = 0.0
|
|
38
39
|
wait_queue_entry_time: float = 0.0
|
|
39
40
|
forward_entry_time: float = 0.0
|
|
@@ -43,20 +44,11 @@ class TimeStats:
|
|
|
43
44
|
decode_prealloc_queue_entry_time: float = 0.0
|
|
44
45
|
decode_transfer_queue_entry_time: float = 0.0
|
|
45
46
|
|
|
46
|
-
class RequestType(Enum):
|
|
47
|
-
UNIFIED = "unified"
|
|
48
|
-
PREFILL = "prefill"
|
|
49
|
-
DECODE = "decode"
|
|
50
|
-
INVALID = "invalid"
|
|
51
|
-
|
|
52
47
|
def get_queueing_time(self) -> float:
|
|
53
48
|
return self.forward_entry_time - self.wait_queue_entry_time
|
|
54
49
|
|
|
55
|
-
def
|
|
56
|
-
|
|
57
|
-
_type = self.get_type()
|
|
58
|
-
|
|
59
|
-
if _type == self.RequestType.UNIFIED:
|
|
50
|
+
def convert_to_duration(self) -> str:
|
|
51
|
+
if self.disagg_mode == DisaggregationMode.NULL:
|
|
60
52
|
queue_duration = self.forward_entry_time - self.wait_queue_entry_time
|
|
61
53
|
forward_duration = self.completion_time - self.forward_entry_time
|
|
62
54
|
|
|
@@ -65,30 +57,28 @@ class TimeStats:
|
|
|
65
57
|
queue_duration >= 0 and forward_duration >= 0
|
|
66
58
|
), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
|
|
67
59
|
|
|
68
|
-
return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
|
|
69
|
-
elif
|
|
60
|
+
return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time:.3f}"
|
|
61
|
+
elif self.disagg_mode == DisaggregationMode.PREFILL:
|
|
70
62
|
bootstrap_duration = (
|
|
71
63
|
self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
|
|
72
64
|
)
|
|
73
|
-
|
|
74
65
|
queue_duration = self.forward_entry_time - self.wait_queue_entry_time
|
|
75
|
-
|
|
76
66
|
forward_duration = self.completion_time - self.forward_entry_time
|
|
77
67
|
|
|
78
68
|
if SGLANG_TEST_REQUEST_TIME_STATS:
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
69
|
+
if self.wait_queue_entry_time > 0:
|
|
70
|
+
assert (
|
|
71
|
+
bootstrap_duration >= 0
|
|
72
|
+
and queue_duration >= 0
|
|
73
|
+
and forward_duration >= 0
|
|
74
|
+
), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
|
|
75
|
+
|
|
76
|
+
return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time:.3f}"
|
|
77
|
+
elif self.disagg_mode == DisaggregationMode.DECODE:
|
|
87
78
|
prealloc_duration = (
|
|
88
79
|
self.decode_transfer_queue_entry_time
|
|
89
80
|
- self.decode_prealloc_queue_entry_time
|
|
90
81
|
)
|
|
91
|
-
|
|
92
82
|
transfer_duration = (
|
|
93
83
|
self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
|
|
94
84
|
)
|
|
@@ -96,42 +86,30 @@ class TimeStats:
|
|
|
96
86
|
forward_duration = self.completion_time - self.forward_entry_time
|
|
97
87
|
|
|
98
88
|
if SGLANG_TEST_REQUEST_TIME_STATS:
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
89
|
+
if self.wait_queue_entry_time > 0:
|
|
90
|
+
assert (
|
|
91
|
+
prealloc_duration >= 0
|
|
92
|
+
and transfer_duration >= 0
|
|
93
|
+
and queue_duration >= 0
|
|
94
|
+
and forward_duration >= 0
|
|
95
|
+
), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0. {self=}"
|
|
96
|
+
|
|
97
|
+
return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time:.3f}"
|
|
107
98
|
else:
|
|
108
|
-
return "
|
|
99
|
+
return "Unknown Time Stats"
|
|
109
100
|
|
|
110
101
|
def format_duration(self, duration: float) -> str:
|
|
111
102
|
return f"{duration * 1e3:.2f}ms"
|
|
112
103
|
|
|
113
|
-
def
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
):
|
|
121
|
-
return self.RequestType.UNIFIED
|
|
122
|
-
elif (
|
|
123
|
-
self.prefill_bootstrap_queue_entry_time > 0.0
|
|
124
|
-
and self.prefill_transfer_queue_entry_time > 0.0
|
|
125
|
-
):
|
|
126
|
-
return self.RequestType.PREFILL
|
|
127
|
-
elif (
|
|
128
|
-
self.decode_prealloc_queue_entry_time > 0.0
|
|
129
|
-
and self.decode_transfer_queue_entry_time > 0.0
|
|
130
|
-
and self.wait_queue_entry_time > 0.0
|
|
131
|
-
):
|
|
132
|
-
return self.RequestType.DECODE
|
|
104
|
+
def disagg_mode_str(self) -> str:
|
|
105
|
+
if self.disagg_mode == DisaggregationMode.NULL:
|
|
106
|
+
return "unified"
|
|
107
|
+
elif self.disagg_mode == DisaggregationMode.DECODE:
|
|
108
|
+
return "decode"
|
|
109
|
+
elif self.disagg_mode == DisaggregationMode.PREFILL:
|
|
110
|
+
return "prefill"
|
|
133
111
|
else:
|
|
134
|
-
return
|
|
112
|
+
return "unknown"
|
|
135
113
|
|
|
136
114
|
|
|
137
115
|
@dataclass
|
|
@@ -140,16 +118,21 @@ class SchedulerStats:
|
|
|
140
118
|
num_running_reqs: int = 0
|
|
141
119
|
num_used_tokens: int = 0
|
|
142
120
|
token_usage: float = 0.0
|
|
121
|
+
pending_prealloc_token_usage: float = 0.0
|
|
143
122
|
swa_token_usage: float = 0.0
|
|
144
123
|
gen_throughput: float = 0.0
|
|
145
124
|
num_queue_reqs: int = 0
|
|
146
125
|
num_grammar_queue_reqs: int = 0
|
|
147
126
|
num_running_reqs_offline_batch: int = 0
|
|
148
|
-
avg_request_queue_latency: float = 0.0
|
|
149
127
|
cache_hit_rate: float = 0.0
|
|
150
128
|
|
|
151
129
|
# Speculative decoding
|
|
152
130
|
spec_accept_length: float = 0.0
|
|
131
|
+
spec_accept_rate: float = 0.0
|
|
132
|
+
|
|
133
|
+
# Retract
|
|
134
|
+
num_retracted_reqs: int = 0
|
|
135
|
+
num_paused_reqs: int = 0
|
|
153
136
|
|
|
154
137
|
# PD disaggregation
|
|
155
138
|
num_prefill_prealloc_queue_reqs: int = 0
|
|
@@ -159,11 +142,6 @@ class SchedulerStats:
|
|
|
159
142
|
kv_transfer_speed_gb_s: float = 0.0
|
|
160
143
|
kv_transfer_latency_ms: float = 0.0
|
|
161
144
|
|
|
162
|
-
# Retract
|
|
163
|
-
total_retracted_reqs: int = 0
|
|
164
|
-
num_retracted_reqs: int = 0
|
|
165
|
-
num_paused_reqs: int = 0
|
|
166
|
-
|
|
167
145
|
# Utilization
|
|
168
146
|
utilization: float = 0.0
|
|
169
147
|
max_running_requests_under_SLO: Optional[int] = None
|
|
@@ -172,6 +150,9 @@ class SchedulerStats:
|
|
|
172
150
|
engine_startup_time: float = 0.0
|
|
173
151
|
engine_load_weights_time: float = 0.0
|
|
174
152
|
|
|
153
|
+
# CUDA graph
|
|
154
|
+
is_cuda_graph: float = 0.0
|
|
155
|
+
|
|
175
156
|
|
|
176
157
|
class SchedulerMetricsCollector:
|
|
177
158
|
|
|
@@ -200,6 +181,12 @@ class SchedulerMetricsCollector:
|
|
|
200
181
|
labelnames=labels.keys(),
|
|
201
182
|
multiprocess_mode="mostrecent",
|
|
202
183
|
)
|
|
184
|
+
self.pending_prealloc_token_usage = Gauge(
|
|
185
|
+
name="sglang:pending_prealloc_token_usage",
|
|
186
|
+
documentation="The token usage for pending preallocated tokens (not preallocated yet).",
|
|
187
|
+
labelnames=labels.keys(),
|
|
188
|
+
multiprocess_mode="mostrecent",
|
|
189
|
+
)
|
|
203
190
|
self.swa_token_usage = Gauge(
|
|
204
191
|
name="sglang:swa_token_usage",
|
|
205
192
|
documentation="The token usage for SWA layers.",
|
|
@@ -230,12 +217,6 @@ class SchedulerMetricsCollector:
|
|
|
230
217
|
labelnames=labels.keys(),
|
|
231
218
|
multiprocess_mode="mostrecent",
|
|
232
219
|
)
|
|
233
|
-
self.avg_request_queue_latency = Gauge(
|
|
234
|
-
name="sglang:avg_request_queue_latency",
|
|
235
|
-
documentation="The average request queue latency for the last batch of requests in seconds.",
|
|
236
|
-
labelnames=labels.keys(),
|
|
237
|
-
multiprocess_mode="mostrecent",
|
|
238
|
-
)
|
|
239
220
|
self.cache_hit_rate = Gauge(
|
|
240
221
|
name="sglang:cache_hit_rate",
|
|
241
222
|
documentation="The prefix cache hit rate.",
|
|
@@ -250,6 +231,24 @@ class SchedulerMetricsCollector:
|
|
|
250
231
|
labelnames=labels.keys(),
|
|
251
232
|
multiprocess_mode="mostrecent",
|
|
252
233
|
)
|
|
234
|
+
self.spec_accept_rate = Gauge(
|
|
235
|
+
name="sglang:spec_accept_rate",
|
|
236
|
+
documentation="The average acceptance rate of speculative decoding (`accepted tokens / total draft tokens` in batch).",
|
|
237
|
+
labelnames=labels.keys(),
|
|
238
|
+
multiprocess_mode="mostrecent",
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
# Retract
|
|
242
|
+
self.num_retracted_reqs = Gauge(
|
|
243
|
+
name="sglang:num_retracted_reqs",
|
|
244
|
+
documentation="The number of retracted requests.",
|
|
245
|
+
labelnames=labels.keys(),
|
|
246
|
+
)
|
|
247
|
+
self.num_paused_reqs = Gauge(
|
|
248
|
+
name="sglang:num_paused_reqs",
|
|
249
|
+
documentation="The number of paused requests by async weight sync.",
|
|
250
|
+
labelnames=labels.keys(),
|
|
251
|
+
)
|
|
253
252
|
|
|
254
253
|
# PD disaggregation
|
|
255
254
|
self.num_prefill_prealloc_queue_reqs = Gauge(
|
|
@@ -299,24 +298,6 @@ class SchedulerMetricsCollector:
|
|
|
299
298
|
multiprocess_mode="mostrecent",
|
|
300
299
|
)
|
|
301
300
|
|
|
302
|
-
# Retract
|
|
303
|
-
self.total_retracted_reqs = Gauge(
|
|
304
|
-
name="sglang:total_retracted_reqs",
|
|
305
|
-
documentation="The total number of retracted requests due to kvcache full.",
|
|
306
|
-
labelnames=labels.keys(),
|
|
307
|
-
multiprocess_mode="mostrecent",
|
|
308
|
-
)
|
|
309
|
-
self.num_retracted_reqs = Gauge(
|
|
310
|
-
name="sglang:num_retracted_reqs",
|
|
311
|
-
documentation="The number of retracted requests.",
|
|
312
|
-
labelnames=labels.keys(),
|
|
313
|
-
)
|
|
314
|
-
self.num_paused_reqs = Gauge(
|
|
315
|
-
name="sglang:num_paused_reqs",
|
|
316
|
-
documentation="The number of paused requests by async weight sync.",
|
|
317
|
-
labelnames=labels.keys(),
|
|
318
|
-
)
|
|
319
|
-
|
|
320
301
|
# Utilization
|
|
321
302
|
self.utilization = Gauge(
|
|
322
303
|
name="sglang:utilization",
|
|
@@ -347,7 +328,7 @@ class SchedulerMetricsCollector:
|
|
|
347
328
|
|
|
348
329
|
# Additional queueing time histogram
|
|
349
330
|
self.queue_time = Histogram(
|
|
350
|
-
name="sglang:
|
|
331
|
+
name="sglang:queue_time_seconds",
|
|
351
332
|
documentation="Histogram of queueing time in seconds.",
|
|
352
333
|
labelnames=labels.keys(),
|
|
353
334
|
buckets=[
|
|
@@ -513,11 +494,26 @@ class SchedulerMetricsCollector:
|
|
|
513
494
|
buckets=tree_traversal_time_buckets,
|
|
514
495
|
)
|
|
515
496
|
|
|
497
|
+
self.per_stage_req_latency_seconds = Histogram(
|
|
498
|
+
name="sglang:per_stage_req_latency_seconds",
|
|
499
|
+
documentation="The latency of each stage of requests.",
|
|
500
|
+
# captures latency in range [1ms - ~1191s]
|
|
501
|
+
buckets=exponential_buckets(start=0.001, width=1.62, length=30),
|
|
502
|
+
labelnames=list(labels.keys()) + ["stage"],
|
|
503
|
+
)
|
|
504
|
+
|
|
505
|
+
self.is_cuda_graph = Gauge(
|
|
506
|
+
name="sglang:is_cuda_graph",
|
|
507
|
+
documentation="Whether the batch is using CUDA graph.",
|
|
508
|
+
labelnames=labels.keys(),
|
|
509
|
+
multiprocess_mode="mostrecent",
|
|
510
|
+
)
|
|
511
|
+
|
|
516
512
|
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
|
|
517
513
|
# Convenience function for logging to gauge.
|
|
518
514
|
gauge.labels(**self.labels).set(data)
|
|
519
515
|
|
|
520
|
-
def
|
|
516
|
+
def _log_histogram(self, histogram, data: Union[int, float]) -> None:
|
|
521
517
|
histogram.labels(**self.labels).observe(data)
|
|
522
518
|
|
|
523
519
|
def increment_bootstrap_failed_reqs(self) -> None:
|
|
@@ -526,10 +522,20 @@ class SchedulerMetricsCollector:
|
|
|
526
522
|
def increment_transfer_failed_reqs(self) -> None:
|
|
527
523
|
self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
|
|
528
524
|
|
|
525
|
+
def observe_per_stage_req_latency(self, stage: str, latency: float) -> None:
|
|
526
|
+
labels_with_stage = {**self.labels, "stage": stage}
|
|
527
|
+
self.per_stage_req_latency_seconds.labels(**labels_with_stage).observe(latency)
|
|
528
|
+
|
|
529
|
+
def observe_queue_time(self, latency: float) -> None:
|
|
530
|
+
self._log_histogram(self.queue_time, latency)
|
|
531
|
+
|
|
529
532
|
def log_stats(self, stats: SchedulerStats) -> None:
|
|
530
533
|
self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
|
|
531
534
|
self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
|
|
532
535
|
self._log_gauge(self.token_usage, stats.token_usage)
|
|
536
|
+
self._log_gauge(
|
|
537
|
+
self.pending_prealloc_token_usage, stats.pending_prealloc_token_usage
|
|
538
|
+
)
|
|
533
539
|
self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
|
|
534
540
|
self._log_gauge(self.gen_throughput, stats.gen_throughput)
|
|
535
541
|
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
|
|
@@ -538,10 +544,10 @@ class SchedulerMetricsCollector:
|
|
|
538
544
|
self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
|
|
539
545
|
)
|
|
540
546
|
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
|
541
|
-
self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
|
|
542
547
|
|
|
543
548
|
# Speculative decoding
|
|
544
549
|
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
|
550
|
+
self._log_gauge(self.spec_accept_rate, stats.spec_accept_rate)
|
|
545
551
|
|
|
546
552
|
# PD disaggregation
|
|
547
553
|
self._log_gauge(
|
|
@@ -560,7 +566,6 @@ class SchedulerMetricsCollector:
|
|
|
560
566
|
self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
|
|
561
567
|
|
|
562
568
|
# Retract
|
|
563
|
-
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
|
564
569
|
self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
|
|
565
570
|
self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
|
|
566
571
|
|
|
@@ -579,24 +584,27 @@ class SchedulerMetricsCollector:
|
|
|
579
584
|
self.engine_load_weights_time, stats.engine_load_weights_time
|
|
580
585
|
)
|
|
581
586
|
|
|
587
|
+
# CUDA graph
|
|
588
|
+
self._log_gauge(self.is_cuda_graph, stats.is_cuda_graph)
|
|
589
|
+
|
|
582
590
|
self.last_log_time = time.perf_counter()
|
|
583
591
|
|
|
584
592
|
def log_grammar_stats(self, grammar_stats) -> None:
|
|
585
593
|
# Duck-typed GrammarStats to avoid cross-package dependency
|
|
586
594
|
if getattr(grammar_stats, "compilation_time", None) is not None:
|
|
587
|
-
self.
|
|
595
|
+
self._log_histogram(
|
|
588
596
|
self.grammar_compilation_time, grammar_stats.compilation_time
|
|
589
597
|
)
|
|
590
598
|
if getattr(grammar_stats, "schema_count", None) is not None:
|
|
591
|
-
self.
|
|
599
|
+
self._log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
|
|
592
600
|
if getattr(grammar_stats, "ebnf_size", None) is not None:
|
|
593
|
-
self.
|
|
601
|
+
self._log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
|
|
594
602
|
tree_times = getattr(grammar_stats, "tree_traversal_time", None)
|
|
595
603
|
if tree_times:
|
|
596
604
|
max_time = max(tree_times)
|
|
597
605
|
avg_time = sum(tree_times) / len(tree_times)
|
|
598
|
-
self.
|
|
599
|
-
self.
|
|
606
|
+
self._log_histogram(self.grammar_tree_traversal_time_max, max_time)
|
|
607
|
+
self._log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
|
|
600
608
|
if getattr(grammar_stats, "is_cache_hit", False):
|
|
601
609
|
self.num_grammar_cache_hit.labels(**self.labels).inc(1)
|
|
602
610
|
if getattr(grammar_stats, "is_grammar_aborted", False):
|
|
@@ -702,7 +710,7 @@ class TokenizerMetricsCollector:
|
|
|
702
710
|
)
|
|
703
711
|
|
|
704
712
|
self.num_aborted_requests_total = Counter(
|
|
705
|
-
name="sglang:
|
|
713
|
+
name="sglang:num_aborted_requests_total",
|
|
706
714
|
documentation="Number of requests aborted.",
|
|
707
715
|
labelnames=labels.keys(),
|
|
708
716
|
)
|
|
@@ -789,7 +797,7 @@ class TokenizerMetricsCollector:
|
|
|
789
797
|
buckets=bucket_time_to_first_token,
|
|
790
798
|
)
|
|
791
799
|
|
|
792
|
-
self.
|
|
800
|
+
self.histogram_inter_token_latency = Histogram(
|
|
793
801
|
name="sglang:inter_token_latency_seconds",
|
|
794
802
|
documentation="Histogram of inter-token latency in seconds.",
|
|
795
803
|
labelnames=labels.keys(),
|
|
@@ -803,14 +811,6 @@ class TokenizerMetricsCollector:
|
|
|
803
811
|
buckets=bucket_e2e_request_latency,
|
|
804
812
|
)
|
|
805
813
|
|
|
806
|
-
# Offline batch specific TTFB histogram
|
|
807
|
-
self.histogram_time_to_first_token_offline_batch = Histogram(
|
|
808
|
-
name="sglang:time_to_first_token_seconds_offline_batch",
|
|
809
|
-
documentation="Histogram of time to first token in seconds for offline batch requests.",
|
|
810
|
-
labelnames=labels.keys(),
|
|
811
|
-
buckets=bucket_time_to_first_token,
|
|
812
|
-
)
|
|
813
|
-
|
|
814
814
|
def observe_one_finished_request(
|
|
815
815
|
self,
|
|
816
816
|
labels: Dict[str, str],
|
|
@@ -834,26 +834,19 @@ class TokenizerMetricsCollector:
|
|
|
834
834
|
float(generation_tokens)
|
|
835
835
|
)
|
|
836
836
|
|
|
837
|
-
def observe_time_to_first_token(
|
|
838
|
-
self
|
|
839
|
-
):
|
|
840
|
-
if type == "batch":
|
|
841
|
-
self.histogram_time_to_first_token_offline_batch.labels(**labels).observe(
|
|
842
|
-
value
|
|
843
|
-
)
|
|
844
|
-
else:
|
|
845
|
-
self.histogram_time_to_first_token.labels(**labels).observe(value)
|
|
837
|
+
def observe_time_to_first_token(self, labels: Dict[str, str], value: float):
|
|
838
|
+
self.histogram_time_to_first_token.labels(**labels).observe(value)
|
|
846
839
|
|
|
847
840
|
def check_time_to_first_token_straggler(self, value: float) -> bool:
|
|
848
841
|
his = self.histogram_time_to_first_token.labels(**self.labels)
|
|
849
842
|
total_observations = sum(bucket._value for bucket in his._buckets)
|
|
850
|
-
if total_observations <
|
|
843
|
+
if total_observations < 1000:
|
|
851
844
|
return False
|
|
852
|
-
|
|
845
|
+
p999_threshold = total_observations * 0.999
|
|
853
846
|
cumulative_count = 0
|
|
854
847
|
for i, bucket in enumerate(his._buckets):
|
|
855
848
|
cumulative_count += bucket._value
|
|
856
|
-
if cumulative_count >
|
|
849
|
+
if cumulative_count > p999_threshold:
|
|
857
850
|
return value >= his._upper_bounds[i]
|
|
858
851
|
return False
|
|
859
852
|
|
|
@@ -864,7 +857,7 @@ class TokenizerMetricsCollector:
|
|
|
864
857
|
|
|
865
858
|
# A faster version of the Histogram::observe which observes multiple values at the same time.
|
|
866
859
|
# reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
|
|
867
|
-
his = self.
|
|
860
|
+
his = self.histogram_inter_token_latency.labels(**labels)
|
|
868
861
|
his._sum.inc(internval)
|
|
869
862
|
|
|
870
863
|
for i, bound in enumerate(his._upper_bounds):
|
|
@@ -872,8 +865,8 @@ class TokenizerMetricsCollector:
|
|
|
872
865
|
his._buckets[i].inc(num_new_tokens)
|
|
873
866
|
break
|
|
874
867
|
|
|
875
|
-
def observe_one_aborted_request(self):
|
|
876
|
-
self.num_aborted_requests_total.labels(**
|
|
868
|
+
def observe_one_aborted_request(self, labels: Dict[str, str]):
|
|
869
|
+
self.num_aborted_requests_total.labels(**labels).inc(1)
|
|
877
870
|
|
|
878
871
|
|
|
879
872
|
@dataclass
|
sglang/srt/metrics/func_timer.py
CHANGED
|
@@ -18,7 +18,9 @@ Records the latency of some functions
|
|
|
18
18
|
import asyncio
|
|
19
19
|
import time
|
|
20
20
|
from functools import wraps
|
|
21
|
-
from typing import Any, Callable,
|
|
21
|
+
from typing import Any, Callable, Optional
|
|
22
|
+
|
|
23
|
+
from sglang.srt.metrics.utils import exponential_buckets
|
|
22
24
|
|
|
23
25
|
enable_metrics = False
|
|
24
26
|
|
|
@@ -42,13 +44,6 @@ def enable_func_timer():
|
|
|
42
44
|
FUNC_LATENCY = None
|
|
43
45
|
|
|
44
46
|
|
|
45
|
-
def exponential_buckets(start: float, width: float, length: int) -> List[float]:
|
|
46
|
-
buckets = []
|
|
47
|
-
for i in range(length):
|
|
48
|
-
buckets.append(start * (width**i))
|
|
49
|
-
return buckets
|
|
50
|
-
|
|
51
|
-
|
|
52
47
|
def time_func_latency(
|
|
53
48
|
func: Callable = None, name: Optional[str] = None
|
|
54
49
|
) -> Callable[..., Any]:
|
sglang/srt/metrics/utils.py
CHANGED
|
@@ -44,5 +44,12 @@ def generate_buckets(
|
|
|
44
44
|
return two_sides_exponential_buckets(float(middle), float(base), int(count))
|
|
45
45
|
if rule == "default":
|
|
46
46
|
return sorted(set(default_buckets))
|
|
47
|
-
assert rule == "
|
|
47
|
+
assert rule == "custom"
|
|
48
48
|
return sorted(set([float(x) for x in buckets_rule[1:]]))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def exponential_buckets(start: float, width: float, length: int) -> List[float]:
|
|
52
|
+
buckets = []
|
|
53
|
+
for i in range(length):
|
|
54
|
+
buckets.append(start * (width**i))
|
|
55
|
+
return buckets
|
|
@@ -34,7 +34,6 @@ from sglang.srt.model_executor.forward_batch_info import (
|
|
|
34
34
|
ForwardMode,
|
|
35
35
|
PPProxyTensors,
|
|
36
36
|
)
|
|
37
|
-
from sglang.srt.patch_torch import monkey_patch_torch_compile
|
|
38
37
|
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
|
39
38
|
from sglang.srt.utils import (
|
|
40
39
|
log_info_on_rank0,
|
|
@@ -43,6 +42,7 @@ from sglang.srt.utils import (
|
|
|
43
42
|
require_mlp_sync,
|
|
44
43
|
require_mlp_tp_gather,
|
|
45
44
|
)
|
|
45
|
+
from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
|
|
46
46
|
|
|
47
47
|
logger = logging.getLogger(__name__)
|
|
48
48
|
|
|
@@ -607,7 +607,7 @@ class CPUGraphRunner:
|
|
|
607
607
|
def get_spec_info(self, num_tokens: int):
|
|
608
608
|
spec_info = None
|
|
609
609
|
if self.model_runner.spec_algorithm.is_eagle():
|
|
610
|
-
from sglang.srt.speculative.
|
|
610
|
+
from sglang.srt.speculative.eagle_info import EagleVerifyInput
|
|
611
611
|
|
|
612
612
|
if self.model_runner.is_draft_worker:
|
|
613
613
|
raise RuntimeError("This should not happen.")
|