PyPI - sglang - Versions diffs - 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

sglang 0.5.3rc0py3-none-any.whl → 0.5.4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482) hide show

sglang/bench_one_batch.py +54 -37
sglang/bench_one_batch_server.py +340 -34
sglang/bench_serving.py +340 -159
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +9 -2
sglang/profiler.py +20 -3
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +8 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +309 -0
sglang/srt/configs/load_config.py +33 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +284 -118
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +576 -0
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +6 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +26 -15
sglang/srt/debug_utils/dumper.py +10 -3
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +268 -98
sglang/srt/disaggregation/decode.py +172 -39
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +203 -555
sglang/srt/disaggregation/nixl/conn.py +217 -63
sglang/srt/disaggregation/prefill.py +113 -270
sglang/srt/disaggregation/utils.py +36 -5
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +203 -97
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +85 -65
sglang/srt/entrypoints/grpc_server.py +632 -305
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +169 -17
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +327 -34
sglang/srt/entrypoints/openai/serving_base.py +74 -8
sglang/srt/entrypoints/openai/serving_chat.py +202 -118
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +20 -4
sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
sglang/srt/entrypoints/openai/serving_responses.py +47 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +323 -0
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +3 -4
sglang/srt/eplb/expert_location.py +30 -5
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +21 -16
sglang/srt/function_call/glm4_moe_detector.py +4 -8
sglang/srt/function_call/gpt_oss_detector.py +24 -1
sglang/srt/function_call/json_array_parser.py +61 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/utils.py +98 -7
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/grpc_request_manager.py +915 -0
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
sglang/srt/layers/activation.py +11 -7
sglang/srt/layers/attention/aiter_backend.py +17 -18
sglang/srt/layers/attention/ascend_backend.py +125 -10
sglang/srt/layers/attention/attention_registry.py +226 -0
sglang/srt/layers/attention/base_attn_backend.py +32 -4
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +52 -15
sglang/srt/layers/attention/flashinfer_backend.py +357 -212
sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
sglang/srt/layers/attention/flashmla_backend.py +9 -7
sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
sglang/srt/layers/attention/mamba/mamba.py +514 -1
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +23 -0
sglang/srt/layers/attention/nsa_backend.py +1201 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/triton_backend.py +249 -42
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
sglang/srt/layers/attention/utils.py +11 -7
sglang/srt/layers/attention/vision.py +61 -3
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +19 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +28 -1
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +47 -15
sglang/srt/layers/linear.py +30 -5
sglang/srt/layers/logits_processor.py +161 -18
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
sglang/srt/layers/moe/ep_moe/layer.py +243 -448
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
sglang/srt/layers/moe/moe_runner/runner.py +3 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +3 -2
sglang/srt/layers/moe/utils.py +27 -1
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/__init__.py +2 -53
sglang/srt/layers/quantization/awq.py +183 -6
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +20 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +86 -20
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +43 -15
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +141 -81
sglang/srt/layers/quantization/mxfp4.py +17 -34
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +1 -4
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +51 -24
sglang/srt/layers/quantization/w8a8_int8.py +45 -27
sglang/srt/layers/radix_attention.py +59 -9
sglang/srt/layers/rotary_embedding.py +750 -46
sglang/srt/layers/sampler.py +84 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +23 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/base_backend.py +3 -3
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +9 -4
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora.py +7 -5
sglang/srt/lora/lora_manager.py +33 -7
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +41 -17
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
sglang/srt/lora/utils.py +7 -5
sglang/srt/managers/cache_controller.py +83 -152
sglang/srt/managers/data_parallel_controller.py +156 -87
sglang/srt/managers/detokenizer_manager.py +51 -24
sglang/srt/managers/io_struct.py +223 -129
sglang/srt/managers/mm_utils.py +49 -10
sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +130 -0
sglang/srt/managers/schedule_batch.py +340 -529
sglang/srt/managers/schedule_policy.py +158 -18
sglang/srt/managers/scheduler.py +665 -620
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
sglang/srt/managers/tokenizer_manager.py +462 -226
sglang/srt/managers/tp_worker.py +217 -156
sglang/srt/managers/utils.py +79 -47
sglang/srt/mem_cache/allocator.py +21 -22
sglang/srt/mem_cache/allocator_ascend.py +42 -28
sglang/srt/mem_cache/base_prefix_cache.py +3 -3
sglang/srt/mem_cache/chunk_cache.py +20 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +38 -0
sglang/srt/mem_cache/hicache_storage.py +44 -2
sglang/srt/mem_cache/hiradix_cache.py +134 -34
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +602 -208
sglang/srt/mem_cache/memory_pool_host.py +134 -183
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +263 -78
sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +115 -58
sglang/srt/metrics/collector.py +113 -120
sglang/srt/metrics/func_timer.py +3 -8
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +2 -2
sglang/srt/model_executor/cuda_graph_runner.py +81 -36
sglang/srt/model_executor/forward_batch_info.py +40 -50
sglang/srt/model_executor/model_runner.py +507 -319
sglang/srt/model_executor/npu_graph_runner.py +11 -5
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +438 -37
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +200 -27
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +40 -56
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +25 -4
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +793 -235
sglang/srt/models/dots_ocr.py +171 -0
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +570 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +17 -1
sglang/srt/models/gemma3n_mm.py +2 -3
sglang/srt/models/glm4_moe.py +17 -40
sglang/srt/models/glm4_moe_nextn.py +4 -4
sglang/srt/models/glm4v.py +3 -2
sglang/srt/models/glm4v_moe.py +6 -6
sglang/srt/models/gpt_oss.py +12 -35
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +4 -2
sglang/srt/models/llama.py +6 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +6 -23
sglang/srt/models/longcat_flash_nextn.py +4 -15
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +27 -6
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +0 -7
sglang/srt/models/qwen2_5_vl.py +5 -5
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +70 -4
sglang/srt/models/qwen2_vl.py +6 -3
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +50 -38
sglang/srt/models/qwen3_next.py +43 -21
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +791 -0
sglang/srt/models/qwen3_vl_moe.py +343 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +268 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +61 -0
sglang/srt/multimodal/processors/base_processor.py +21 -9
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +2 -4
sglang/srt/multimodal/processors/glm4v.py +1 -5
sglang/srt/multimodal/processors/internvl.py +20 -10
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +83 -17
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/parser/reasoning_parser.py +0 -1
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +36 -23
sglang/srt/sampling/sampling_params.py +75 -0
sglang/srt/server_args.py +1300 -338
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +161 -0
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
sglang/srt/speculative/eagle_info.py +786 -0
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +113 -1270
sglang/srt/speculative/eagle_worker.py +120 -285
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/ngram_info.py +433 -0
sglang/srt/speculative/ngram_worker.py +246 -0
sglang/srt/speculative/spec_info.py +49 -0
sglang/srt/speculative/spec_utils.py +641 -0
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/tracing/trace.py +32 -6
sglang/srt/two_batch_overlap.py +35 -18
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/{utils.py → utils/common.py} +583 -113
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/profile_merger.py +199 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/get_logits_ut.py +57 -0
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +120 -11
sglang/test/runners.py +3 -1
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +8 -2
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +3 -4
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +430 -0
sglang/test/test_deterministic_utils.py +73 -0
sglang/test/test_disaggregation_utils.py +93 -1
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +432 -16
sglang/utils.py +10 -1
sglang/version.py +1 -1
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
sglang/srt/entrypoints/grpc_request_manager.py +0 -580
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0

sglang/srt/metrics/collector.py CHANGED Viewed

@@ -14,10 +14,10 @@
 """Utilities for Prometheus Metrics Collection."""
 import time
 from dataclasses import dataclass, field
-from enum import Enum
 from typing import Dict, List, Optional, Union
-from sglang.srt.metrics.utils import generate_buckets
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var
@@ -34,6 +34,7 @@ class TimeStats:
     Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
     """
+    disagg_mode: DisaggregationMode = DisaggregationMode.NULL
     lb_entry_time: float = 0.0
     wait_queue_entry_time: float = 0.0
     forward_entry_time: float = 0.0
@@ -43,20 +44,11 @@ class TimeStats:
     decode_prealloc_queue_entry_time: float = 0.0
     decode_transfer_queue_entry_time: float = 0.0
-    class RequestType(Enum):
-        UNIFIED = "unified"
-        PREFILL = "prefill"
-        DECODE = "decode"
-        INVALID = "invalid"
     def get_queueing_time(self) -> float:
         return self.forward_entry_time - self.wait_queue_entry_time
-    def __str__(self) -> str:
-        # if unified
-        _type = self.get_type()
-        if _type == self.RequestType.UNIFIED:
+    def convert_to_duration(self) -> str:
+        if self.disagg_mode == DisaggregationMode.NULL:
             queue_duration = self.forward_entry_time - self.wait_queue_entry_time
             forward_duration = self.completion_time - self.forward_entry_time
@@ -65,30 +57,28 @@ class TimeStats:
                     queue_duration >= 0 and forward_duration >= 0
                 ), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
-            return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
-        elif _type == self.RequestType.PREFILL:
+            return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time:.3f}"
+        elif self.disagg_mode == DisaggregationMode.PREFILL:
             bootstrap_duration = (
                 self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
             )
             queue_duration = self.forward_entry_time - self.wait_queue_entry_time
             forward_duration = self.completion_time - self.forward_entry_time
             if SGLANG_TEST_REQUEST_TIME_STATS:
-                assert (
-                    bootstrap_duration >= 0
-                    and queue_duration >= 0
-                    and forward_duration >= 0
-                ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
-            return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time}"
-        # if decode
-        elif _type == self.RequestType.DECODE:
+                if self.wait_queue_entry_time > 0:
+                    assert (
+                        bootstrap_duration >= 0
+                        and queue_duration >= 0
+                        and forward_duration >= 0
+                    ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+            return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time:.3f}"
+        elif self.disagg_mode == DisaggregationMode.DECODE:
             prealloc_duration = (
                 self.decode_transfer_queue_entry_time
                 - self.decode_prealloc_queue_entry_time
             )
             transfer_duration = (
                 self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
             )
@@ -96,42 +86,30 @@ class TimeStats:
             forward_duration = self.completion_time - self.forward_entry_time
             if SGLANG_TEST_REQUEST_TIME_STATS:
-                assert (
-                    prealloc_duration >= 0
-                    and transfer_duration >= 0
-                    and queue_duration >= 0
-                    and forward_duration >= 0
-                ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
-            return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time}"
+                if self.wait_queue_entry_time > 0:
+                    assert (
+                        prealloc_duration >= 0
+                        and transfer_duration >= 0
+                        and queue_duration >= 0
+                        and forward_duration >= 0
+                    ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0. {self=}"
+            return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time:.3f}"
         else:
-            return "Invalid Time Stats"
+            return "Unknown Time Stats"
     def format_duration(self, duration: float) -> str:
         return f"{duration * 1e3:.2f}ms"
-    def get_type(self) -> RequestType:
-        """Determine the type of request based on timestamp values."""
-        if (
-            self.prefill_bootstrap_queue_entry_time == 0.0
-            and self.prefill_transfer_queue_entry_time == 0.0
-            and self.decode_prealloc_queue_entry_time == 0.0
-            and self.decode_transfer_queue_entry_time == 0.0
-        ):
-            return self.RequestType.UNIFIED
-        elif (
-            self.prefill_bootstrap_queue_entry_time > 0.0
-            and self.prefill_transfer_queue_entry_time > 0.0
-        ):
-            return self.RequestType.PREFILL
-        elif (
-            self.decode_prealloc_queue_entry_time > 0.0
-            and self.decode_transfer_queue_entry_time > 0.0
-            and self.wait_queue_entry_time > 0.0
-        ):
-            return self.RequestType.DECODE
+    def disagg_mode_str(self) -> str:
+        if self.disagg_mode == DisaggregationMode.NULL:
+            return "unified"
+        elif self.disagg_mode == DisaggregationMode.DECODE:
+            return "decode"
+        elif self.disagg_mode == DisaggregationMode.PREFILL:
+            return "prefill"
         else:
-            return self.RequestType.INVALID
+            return "unknown"
 @dataclass
@@ -140,16 +118,21 @@ class SchedulerStats:
     num_running_reqs: int = 0
     num_used_tokens: int = 0
     token_usage: float = 0.0
+    pending_prealloc_token_usage: float = 0.0
     swa_token_usage: float = 0.0
     gen_throughput: float = 0.0
     num_queue_reqs: int = 0
     num_grammar_queue_reqs: int = 0
     num_running_reqs_offline_batch: int = 0
-    avg_request_queue_latency: float = 0.0
     cache_hit_rate: float = 0.0
     # Speculative decoding
     spec_accept_length: float = 0.0
+    spec_accept_rate: float = 0.0
+    # Retract
+    num_retracted_reqs: int = 0
+    num_paused_reqs: int = 0
     # PD disaggregation
     num_prefill_prealloc_queue_reqs: int = 0
@@ -159,11 +142,6 @@ class SchedulerStats:
     kv_transfer_speed_gb_s: float = 0.0
     kv_transfer_latency_ms: float = 0.0
-    # Retract
-    total_retracted_reqs: int = 0
-    num_retracted_reqs: int = 0
-    num_paused_reqs: int = 0
     # Utilization
     utilization: float = 0.0
     max_running_requests_under_SLO: Optional[int] = None
@@ -172,6 +150,9 @@ class SchedulerStats:
     engine_startup_time: float = 0.0
     engine_load_weights_time: float = 0.0
+    # CUDA graph
+    is_cuda_graph: float = 0.0
 class SchedulerMetricsCollector:
@@ -200,6 +181,12 @@ class SchedulerMetricsCollector:
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
+        self.pending_prealloc_token_usage = Gauge(
+            name="sglang:pending_prealloc_token_usage",
+            documentation="The token usage for pending preallocated tokens (not preallocated yet).",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
         self.swa_token_usage = Gauge(
             name="sglang:swa_token_usage",
             documentation="The token usage for SWA layers.",
@@ -230,12 +217,6 @@ class SchedulerMetricsCollector:
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
-        self.avg_request_queue_latency = Gauge(
-            name="sglang:avg_request_queue_latency",
-            documentation="The average request queue latency for the last batch of requests in seconds.",
-            labelnames=labels.keys(),
-            multiprocess_mode="mostrecent",
-        )
         self.cache_hit_rate = Gauge(
             name="sglang:cache_hit_rate",
             documentation="The prefix cache hit rate.",
@@ -250,6 +231,24 @@ class SchedulerMetricsCollector:
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
+        self.spec_accept_rate = Gauge(
+            name="sglang:spec_accept_rate",
+            documentation="The average acceptance rate of speculative decoding (`accepted tokens / total draft tokens` in batch).",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        # Retract
+        self.num_retracted_reqs = Gauge(
+            name="sglang:num_retracted_reqs",
+            documentation="The number of retracted requests.",
+            labelnames=labels.keys(),
+        )
+        self.num_paused_reqs = Gauge(
+            name="sglang:num_paused_reqs",
+            documentation="The number of paused requests by async weight sync.",
+            labelnames=labels.keys(),
+        )
         # PD disaggregation
         self.num_prefill_prealloc_queue_reqs = Gauge(
@@ -299,24 +298,6 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
-        # Retract
-        self.total_retracted_reqs = Gauge(
-            name="sglang:total_retracted_reqs",
-            documentation="The total number of retracted requests due to kvcache full.",
-            labelnames=labels.keys(),
-            multiprocess_mode="mostrecent",
-        )
-        self.num_retracted_reqs = Gauge(
-            name="sglang:num_retracted_reqs",
-            documentation="The number of retracted requests.",
-            labelnames=labels.keys(),
-        )
-        self.num_paused_reqs = Gauge(
-            name="sglang:num_paused_reqs",
-            documentation="The number of paused requests by async weight sync.",
-            labelnames=labels.keys(),
-        )
         # Utilization
         self.utilization = Gauge(
             name="sglang:utilization",
@@ -347,7 +328,7 @@ class SchedulerMetricsCollector:
         # Additional queueing time histogram
         self.queue_time = Histogram(
-            name="sglang:queue_time_s",
+            name="sglang:queue_time_seconds",
             documentation="Histogram of queueing time in seconds.",
             labelnames=labels.keys(),
             buckets=[
@@ -513,11 +494,26 @@ class SchedulerMetricsCollector:
             buckets=tree_traversal_time_buckets,
         )
+        self.per_stage_req_latency_seconds = Histogram(
+            name="sglang:per_stage_req_latency_seconds",
+            documentation="The latency of each stage of requests.",
+            # captures latency in range [1ms - ~1191s]
+            buckets=exponential_buckets(start=0.001, width=1.62, length=30),
+            labelnames=list(labels.keys()) + ["stage"],
+        )
+        self.is_cuda_graph = Gauge(
+            name="sglang:is_cuda_graph",
+            documentation="Whether the batch is using CUDA graph.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
-    def log_histogram(self, histogram, data: Union[int, float]) -> None:
+    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
         histogram.labels(**self.labels).observe(data)
     def increment_bootstrap_failed_reqs(self) -> None:
@@ -526,10 +522,20 @@ class SchedulerMetricsCollector:
     def increment_transfer_failed_reqs(self) -> None:
         self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
+    def observe_per_stage_req_latency(self, stage: str, latency: float) -> None:
+        labels_with_stage = {**self.labels, "stage": stage}
+        self.per_stage_req_latency_seconds.labels(**labels_with_stage).observe(latency)
+    def observe_queue_time(self, latency: float) -> None:
+        self._log_histogram(self.queue_time, latency)
     def log_stats(self, stats: SchedulerStats) -> None:
         self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
         self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
         self._log_gauge(self.token_usage, stats.token_usage)
+        self._log_gauge(
+            self.pending_prealloc_token_usage, stats.pending_prealloc_token_usage
+        )
         self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
         self._log_gauge(self.gen_throughput, stats.gen_throughput)
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
@@ -538,10 +544,10 @@ class SchedulerMetricsCollector:
             self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
         )
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
-        self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
         # Speculative decoding
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
+        self._log_gauge(self.spec_accept_rate, stats.spec_accept_rate)
         # PD disaggregation
         self._log_gauge(
@@ -560,7 +566,6 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
         # Retract
-        self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
         self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
         self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
@@ -579,24 +584,27 @@ class SchedulerMetricsCollector:
                 self.engine_load_weights_time, stats.engine_load_weights_time
             )
+        # CUDA graph
+        self._log_gauge(self.is_cuda_graph, stats.is_cuda_graph)
         self.last_log_time = time.perf_counter()
     def log_grammar_stats(self, grammar_stats) -> None:
         # Duck-typed GrammarStats to avoid cross-package dependency
         if getattr(grammar_stats, "compilation_time", None) is not None:
-            self.log_histogram(
+            self._log_histogram(
                 self.grammar_compilation_time, grammar_stats.compilation_time
             )
         if getattr(grammar_stats, "schema_count", None) is not None:
-            self.log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
+            self._log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
         if getattr(grammar_stats, "ebnf_size", None) is not None:
-            self.log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
+            self._log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
         tree_times = getattr(grammar_stats, "tree_traversal_time", None)
         if tree_times:
             max_time = max(tree_times)
             avg_time = sum(tree_times) / len(tree_times)
-            self.log_histogram(self.grammar_tree_traversal_time_max, max_time)
-            self.log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
+            self._log_histogram(self.grammar_tree_traversal_time_max, max_time)
+            self._log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
         if getattr(grammar_stats, "is_cache_hit", False):
             self.num_grammar_cache_hit.labels(**self.labels).inc(1)
         if getattr(grammar_stats, "is_grammar_aborted", False):
@@ -702,7 +710,7 @@ class TokenizerMetricsCollector:
         )
         self.num_aborted_requests_total = Counter(
-            name="sglang:num_aborted_requests",
+            name="sglang:num_aborted_requests_total",
             documentation="Number of requests aborted.",
             labelnames=labels.keys(),
         )
@@ -789,7 +797,7 @@ class TokenizerMetricsCollector:
             buckets=bucket_time_to_first_token,
         )
-        self.histogram_inter_token_latency_seconds = Histogram(
+        self.histogram_inter_token_latency = Histogram(
             name="sglang:inter_token_latency_seconds",
             documentation="Histogram of inter-token latency in seconds.",
             labelnames=labels.keys(),
@@ -803,14 +811,6 @@ class TokenizerMetricsCollector:
             buckets=bucket_e2e_request_latency,
         )
-        # Offline batch specific TTFB histogram
-        self.histogram_time_to_first_token_offline_batch = Histogram(
-            name="sglang:time_to_first_token_seconds_offline_batch",
-            documentation="Histogram of time to first token in seconds for offline batch requests.",
-            labelnames=labels.keys(),
-            buckets=bucket_time_to_first_token,
-        )
     def observe_one_finished_request(
         self,
         labels: Dict[str, str],
@@ -834,26 +834,19 @@ class TokenizerMetricsCollector:
                 float(generation_tokens)
             )
-    def observe_time_to_first_token(
-        self, labels: Dict[str, str], value: float, type: str = ""
-    ):
-        if type == "batch":
-            self.histogram_time_to_first_token_offline_batch.labels(**labels).observe(
-                value
-            )
-        else:
-            self.histogram_time_to_first_token.labels(**labels).observe(value)
+    def observe_time_to_first_token(self, labels: Dict[str, str], value: float):
+        self.histogram_time_to_first_token.labels(**labels).observe(value)
     def check_time_to_first_token_straggler(self, value: float) -> bool:
         his = self.histogram_time_to_first_token.labels(**self.labels)
         total_observations = sum(bucket._value for bucket in his._buckets)
-        if total_observations < 100:
+        if total_observations < 1000:
             return False
-        p99_threshold = total_observations * 0.99
+        p999_threshold = total_observations * 0.999
         cumulative_count = 0
         for i, bucket in enumerate(his._buckets):
             cumulative_count += bucket._value
-            if cumulative_count > p99_threshold:
+            if cumulative_count > p999_threshold:
                 return value >= his._upper_bounds[i]
         return False
@@ -864,7 +857,7 @@ class TokenizerMetricsCollector:
         # A faster version of the Histogram::observe which observes multiple values at the same time.
         # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
-        his = self.histogram_inter_token_latency_seconds.labels(**labels)
+        his = self.histogram_inter_token_latency.labels(**labels)
         his._sum.inc(internval)
         for i, bound in enumerate(his._upper_bounds):
@@ -872,8 +865,8 @@ class TokenizerMetricsCollector:
                 his._buckets[i].inc(num_new_tokens)
                 break
-    def observe_one_aborted_request(self):
-        self.num_aborted_requests_total.labels(**self.labels).inc(1)
+    def observe_one_aborted_request(self, labels: Dict[str, str]):
+        self.num_aborted_requests_total.labels(**labels).inc(1)
 @dataclass

sglang/srt/metrics/func_timer.py CHANGED Viewed

@@ -18,7 +18,9 @@ Records the latency of some functions
 import asyncio
 import time
 from functools import wraps
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, Optional
+from sglang.srt.metrics.utils import exponential_buckets
 enable_metrics = False
@@ -42,13 +44,6 @@ def enable_func_timer():
 FUNC_LATENCY = None
-def exponential_buckets(start: float, width: float, length: int) -> List[float]:
-    buckets = []
-    for i in range(length):
-        buckets.append(start * (width**i))
-    return buckets
 def time_func_latency(
     func: Callable = None, name: Optional[str] = None
 ) -> Callable[..., Any]:

sglang/srt/metrics/utils.py CHANGED Viewed

@@ -44,5 +44,12 @@ def generate_buckets(
         return two_sides_exponential_buckets(float(middle), float(base), int(count))
     if rule == "default":
         return sorted(set(default_buckets))
-    assert rule == "customer"
+    assert rule == "custom"
     return sorted(set([float(x) for x in buckets_rule[1:]]))
+def exponential_buckets(start: float, width: float, length: int) -> List[float]:
+    buckets = []
+    for i in range(length):
+        buckets.append(start * (width**i))
+    return buckets

sglang/srt/model_executor/cpu_graph_runner.py CHANGED Viewed

@@ -34,7 +34,6 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
     PPProxyTensors,
 )
-from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     log_info_on_rank0,
@@ -43,6 +42,7 @@ from sglang.srt.utils import (
     require_mlp_sync,
     require_mlp_tp_gather,
 )
+from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
 logger = logging.getLogger(__name__)
@@ -607,7 +607,7 @@ class CPUGraphRunner:
     def get_spec_info(self, num_tokens: int):
         spec_info = None
         if self.model_runner.spec_algorithm.is_eagle():
-            from sglang.srt.speculative.eagle_utils import EagleVerifyInput
+            from sglang.srt.speculative.eagle_info import EagleVerifyInput
             if self.model_runner.is_draft_worker:
                 raise RuntimeError("This should not happen.")

sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

sglang 0.5.3rc0py3-none-any.whl → 0.5.4py3-none-any.whl