PyPI - sglang - Versions diffs - 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl - Mend

sglang 0.5.2rc1py3-none-any.whl → 0.5.3py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (395) hide show

sglang/bench_one_batch.py +7 -9
sglang/bench_one_batch_server.py +330 -31
sglang/bench_serving.py +267 -32
sglang/global_config.py +2 -2
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/lang/interpreter.py +1 -1
sglang/launch_server.py +14 -0
sglang/profiler.py +2 -2
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
sglang/srt/configs/__init__.py +8 -0
sglang/srt/configs/device_config.py +3 -1
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/dots_vlm.py +139 -0
sglang/srt/configs/falcon_h1.py +360 -0
sglang/srt/configs/internvl.py +6 -0
sglang/srt/configs/load_config.py +9 -0
sglang/srt/configs/model_config.py +181 -82
sglang/srt/configs/qwen3_next.py +326 -0
sglang/srt/configs/qwen3_vl.py +586 -0
sglang/srt/connector/__init__.py +8 -1
sglang/srt/connector/remote_instance.py +82 -0
sglang/srt/constrained/base_grammar_backend.py +49 -12
sglang/srt/constrained/llguidance_backend.py +0 -1
sglang/srt/constrained/outlines_backend.py +0 -1
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/xgrammar_backend.py +30 -9
sglang/srt/custom_op.py +11 -1
sglang/srt/debug_utils/dump_comparator.py +81 -44
sglang/srt/debug_utils/dump_loader.py +97 -0
sglang/srt/debug_utils/dumper.py +21 -6
sglang/srt/debug_utils/text_comparator.py +73 -11
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
sglang/srt/disaggregation/base/conn.py +1 -1
sglang/srt/disaggregation/common/conn.py +279 -108
sglang/srt/disaggregation/decode.py +71 -19
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
sglang/srt/disaggregation/fake/conn.py +1 -1
sglang/srt/disaggregation/mini_lb.py +6 -445
sglang/srt/disaggregation/mooncake/conn.py +55 -537
sglang/srt/disaggregation/nixl/conn.py +326 -53
sglang/srt/disaggregation/prefill.py +36 -17
sglang/srt/disaggregation/utils.py +40 -54
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/parallel_state.py +192 -113
sglang/srt/entrypoints/engine.py +59 -18
sglang/srt/entrypoints/grpc_request_manager.py +855 -0
sglang/srt/entrypoints/grpc_server.py +810 -0
sglang/srt/entrypoints/http_server.py +132 -57
sglang/srt/entrypoints/openai/protocol.py +115 -7
sglang/srt/entrypoints/openai/serving_base.py +65 -3
sglang/srt/entrypoints/openai/serving_chat.py +207 -58
sglang/srt/entrypoints/openai/serving_completions.py +17 -4
sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
sglang/srt/entrypoints/openai/serving_responses.py +49 -4
sglang/srt/entrypoints/openai/serving_score.py +1 -0
sglang/srt/environ.py +285 -0
sglang/srt/eplb/eplb_manager.py +2 -2
sglang/srt/eplb/expert_distribution.py +26 -13
sglang/srt/eplb/expert_location.py +38 -8
sglang/srt/eplb/expert_location_updater.py +1 -1
sglang/srt/function_call/base_format_detector.py +3 -6
sglang/srt/function_call/ebnf_composer.py +11 -9
sglang/srt/function_call/function_call_parser.py +9 -2
sglang/srt/function_call/glm4_moe_detector.py +4 -4
sglang/srt/function_call/gpt_oss_detector.py +24 -1
sglang/srt/function_call/json_array_parser.py +63 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/qwen3_coder_detector.py +1 -1
sglang/srt/function_call/utils.py +96 -5
sglang/srt/grpc/__init__.py +1 -0
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
sglang/srt/layers/activation.py +143 -9
sglang/srt/layers/attention/aiter_backend.py +106 -82
sglang/srt/layers/attention/ascend_backend.py +115 -9
sglang/srt/layers/attention/attention_registry.py +206 -0
sglang/srt/layers/attention/base_attn_backend.py +12 -3
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk.py +242 -0
sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
sglang/srt/layers/attention/fla/chunk_o.py +178 -0
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
sglang/srt/layers/attention/fla/cumsum.py +300 -0
sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
sglang/srt/layers/attention/fla/index.py +37 -0
sglang/srt/layers/attention/fla/l2norm.py +150 -0
sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
sglang/srt/layers/attention/fla/op.py +66 -0
sglang/srt/layers/attention/fla/solve_tril.py +465 -0
sglang/srt/layers/attention/fla/utils.py +331 -0
sglang/srt/layers/attention/fla/wy_fast.py +158 -0
sglang/srt/layers/attention/flashattention_backend.py +41 -8
sglang/srt/layers/attention/flashinfer_backend.py +118 -198
sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
sglang/srt/layers/attention/flashmla_backend.py +7 -5
sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
sglang/srt/layers/attention/intel_amx_backend.py +3 -0
sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
sglang/srt/layers/attention/mamba/mamba.py +629 -0
sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/utils.py +24 -0
sglang/srt/layers/attention/nsa_backend.py +887 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/torch_native_backend.py +12 -6
sglang/srt/layers/attention/triton_backend.py +57 -7
sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
sglang/srt/layers/attention/vision.py +58 -0
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
sglang/srt/layers/communicator.py +53 -7
sglang/srt/layers/dp_attention.py +41 -2
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +34 -15
sglang/srt/layers/linear.py +55 -7
sglang/srt/layers/logits_processor.py +44 -12
sglang/srt/layers/moe/__init__.py +2 -1
sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
sglang/srt/layers/moe/ep_moe/layer.py +256 -63
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
sglang/srt/layers/moe/fused_moe_native.py +5 -3
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
sglang/srt/layers/moe/moe_runner/base.py +274 -1
sglang/srt/layers/moe/moe_runner/runner.py +80 -0
sglang/srt/layers/moe/moe_runner/triton.py +448 -0
sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
sglang/srt/layers/moe/topk.py +30 -9
sglang/srt/layers/moe/utils.py +22 -7
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/awq.py +19 -7
sglang/srt/layers/quantization/base_config.py +11 -6
sglang/srt/layers/quantization/blockwise_int8.py +38 -27
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
sglang/srt/layers/quantization/fp8.py +78 -49
sglang/srt/layers/quantization/fp8_utils.py +51 -32
sglang/srt/layers/quantization/gptq.py +25 -17
sglang/srt/layers/quantization/modelopt_quant.py +225 -57
sglang/srt/layers/quantization/moe_wna16.py +21 -18
sglang/srt/layers/quantization/mxfp4.py +77 -42
sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
sglang/srt/layers/quantization/quark/utils.py +97 -0
sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
sglang/srt/layers/quantization/unquant.py +135 -47
sglang/srt/layers/quantization/w4afp8.py +26 -17
sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
sglang/srt/layers/quantization/w8a8_int8.py +91 -41
sglang/srt/layers/rocm_linear_utils.py +44 -0
sglang/srt/layers/rotary_embedding.py +78 -49
sglang/srt/layers/sampler.py +213 -21
sglang/srt/layers/utils.py +23 -0
sglang/srt/lora/backend/base_backend.py +50 -8
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +99 -5
sglang/srt/lora/layers.py +32 -0
sglang/srt/lora/lora.py +8 -3
sglang/srt/lora/lora_manager.py +52 -118
sglang/srt/lora/mem_pool.py +25 -11
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
sglang/srt/lora/utils.py +22 -11
sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
sglang/srt/managers/cache_controller.py +215 -314
sglang/srt/managers/data_parallel_controller.py +115 -80
sglang/srt/managers/detokenizer_manager.py +19 -15
sglang/srt/managers/disagg_service.py +46 -0
sglang/srt/managers/io_struct.py +340 -109
sglang/srt/managers/mm_utils.py +44 -6
sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +53 -0
sglang/srt/managers/schedule_batch.py +240 -138
sglang/srt/managers/schedule_policy.py +147 -19
sglang/srt/managers/scheduler.py +501 -304
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
sglang/srt/managers/template_manager.py +3 -3
sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
sglang/srt/managers/tokenizer_manager.py +321 -632
sglang/srt/managers/tp_worker.py +81 -22
sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
sglang/srt/managers/utils.py +1 -45
sglang/srt/mem_cache/allocator.py +15 -21
sglang/srt/mem_cache/allocator_ascend.py +41 -27
sglang/srt/mem_cache/base_prefix_cache.py +1 -1
sglang/srt/mem_cache/chunk_cache.py +8 -1
sglang/srt/mem_cache/evict_policy.py +23 -0
sglang/srt/mem_cache/hicache_storage.py +58 -34
sglang/srt/mem_cache/hiradix_cache.py +227 -80
sglang/srt/mem_cache/memory_pool.py +535 -58
sglang/srt/mem_cache/memory_pool_host.py +239 -223
sglang/srt/mem_cache/radix_cache.py +222 -73
sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
sglang/srt/mem_cache/swa_radix_cache.py +25 -36
sglang/srt/metrics/collector.py +519 -132
sglang/srt/metrics/func_timer.py +2 -7
sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
sglang/srt/metrics/utils.py +55 -0
sglang/srt/model_executor/cpu_graph_runner.py +640 -0
sglang/srt/model_executor/cuda_graph_runner.py +52 -37
sglang/srt/model_executor/forward_batch_info.py +98 -57
sglang/srt/model_executor/model_runner.py +433 -158
sglang/srt/model_executor/npu_graph_runner.py +12 -5
sglang/srt/model_loader/__init__.py +9 -3
sglang/srt/model_loader/loader.py +133 -5
sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
sglang/srt/model_loader/weight_utils.py +158 -3
sglang/srt/models/apertus.py +686 -0
sglang/srt/models/bailing_moe.py +820 -217
sglang/srt/models/bailing_moe_nextn.py +168 -0
sglang/srt/models/deepseek_nextn.py +6 -1
sglang/srt/models/deepseek_v2.py +833 -152
sglang/srt/models/dots_ocr.py +173 -0
sglang/srt/models/dots_vlm.py +174 -0
sglang/srt/models/dots_vlm_vit.py +337 -0
sglang/srt/models/ernie4.py +1 -1
sglang/srt/models/falcon_h1.py +576 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +1 -1
sglang/srt/models/gemma3n_mm.py +2 -2
sglang/srt/models/glm4_moe.py +14 -5
sglang/srt/models/glm4_moe_nextn.py +2 -2
sglang/srt/models/glm4v.py +5 -3
sglang/srt/models/glm4v_moe.py +4 -1
sglang/srt/models/gpt_oss.py +8 -31
sglang/srt/models/internvl.py +28 -0
sglang/srt/models/kimi_vl_moonvit.py +2 -2
sglang/srt/models/llama.py +4 -0
sglang/srt/models/llama4.py +9 -0
sglang/srt/models/llama_eagle3.py +13 -0
sglang/srt/models/longcat_flash.py +3 -3
sglang/srt/models/longcat_flash_nextn.py +1 -1
sglang/srt/models/minicpmv.py +165 -3
sglang/srt/models/mllama4.py +40 -4
sglang/srt/models/opt.py +637 -0
sglang/srt/models/qwen2_5_vl.py +29 -5
sglang/srt/models/qwen2_audio.py +1 -1
sglang/srt/models/qwen2_moe.py +124 -14
sglang/srt/models/qwen2_vl.py +1 -1
sglang/srt/models/qwen3.py +26 -5
sglang/srt/models/qwen3_moe.py +71 -12
sglang/srt/models/qwen3_next.py +1069 -0
sglang/srt/models/qwen3_next_mtp.py +112 -0
sglang/srt/models/qwen3_vl.py +787 -0
sglang/srt/models/qwen3_vl_moe.py +471 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/sarashina2_vision.py +269 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/step3_vl.py +1 -1
sglang/srt/models/torch_native_llama.py +10 -3
sglang/srt/models/utils.py +51 -0
sglang/srt/multimodal/processors/base_processor.py +15 -7
sglang/srt/multimodal/processors/dots_vlm.py +98 -0
sglang/srt/multimodal/processors/glm4v.py +9 -9
sglang/srt/multimodal/processors/internvl.py +153 -129
sglang/srt/multimodal/processors/qwen_vl.py +23 -6
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/offloader.py +27 -3
sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
sglang/srt/sampling/sampling_batch_info.py +38 -17
sglang/srt/sampling/sampling_params.py +7 -0
sglang/srt/server_args.py +1030 -254
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +151 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
sglang/srt/speculative/eagle_worker.py +253 -136
sglang/srt/speculative/ngram_utils.py +428 -0
sglang/srt/speculative/ngram_worker.py +245 -0
sglang/srt/speculative/spec_info.py +52 -0
sglang/srt/speculative/spec_utils.py +606 -0
sglang/srt/speculative/standalone_worker.py +109 -0
sglang/srt/torch_memory_saver_adapter.py +5 -7
sglang/srt/tracing/trace.py +578 -0
sglang/srt/two_batch_overlap.py +8 -5
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{utils.py → utils/common.py} +445 -77
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +2 -2
sglang/test/attention/test_trtllm_mla_backend.py +169 -5
sglang/test/few_shot_gsm8k.py +1 -0
sglang/test/get_logits_ut.py +57 -0
sglang/test/run_eval.py +79 -11
sglang/test/runners.py +5 -1
sglang/test/simple_eval_common.py +5 -2
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +2 -2
sglang/test/test_cutlass_moe.py +24 -6
sglang/test/test_deterministic.py +297 -0
sglang/test/test_disaggregation_utils.py +77 -0
sglang/test/test_fp4_moe.py +370 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +383 -5
sglang/utils.py +22 -1
sglang/version.py +1 -1
{sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
{sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
sglang/srt/disaggregation/launch_lb.py +0 -118
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
/sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
/sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
/sglang/srt/{conversation.py → parser/conversation.py} +0 -0
/sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
/sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
{sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0

sglang/srt/metrics/collector.py CHANGED Viewed

@@ -12,12 +12,13 @@
 # limitations under the License.
 # ==============================================================================
 """Utilities for Prometheus Metrics Collection."""
 import time
-from dataclasses import dataclass
-from enum import Enum
+from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
+from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var
 SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
@@ -33,6 +34,7 @@ class TimeStats:
     Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
     """
+    disagg_mode: DisaggregationMode = DisaggregationMode.NULL
     lb_entry_time: float = 0.0
     wait_queue_entry_time: float = 0.0
     forward_entry_time: float = 0.0
@@ -42,17 +44,11 @@ class TimeStats:
     decode_prealloc_queue_entry_time: float = 0.0
     decode_transfer_queue_entry_time: float = 0.0
-    class RequestType(Enum):
-        UNIFIED = "unified"
-        PREFILL = "prefill"
-        DECODE = "decode"
-        INVALID = "invalid"
-    def __str__(self) -> str:
-        # if unified
-        _type = self.get_type()
+    def get_queueing_time(self) -> float:
+        return self.forward_entry_time - self.wait_queue_entry_time
-        if _type == self.RequestType.UNIFIED:
+    def convert_to_duration(self) -> str:
+        if self.disagg_mode == DisaggregationMode.NULL:
             queue_duration = self.forward_entry_time - self.wait_queue_entry_time
             forward_duration = self.completion_time - self.forward_entry_time
@@ -61,30 +57,28 @@ class TimeStats:
                     queue_duration >= 0 and forward_duration >= 0
                 ), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
-            return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
-        elif _type == self.RequestType.PREFILL:
+            return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time:.3f}"
+        elif self.disagg_mode == DisaggregationMode.PREFILL:
             bootstrap_duration = (
                 self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
             )
             queue_duration = self.forward_entry_time - self.wait_queue_entry_time
             forward_duration = self.completion_time - self.forward_entry_time
             if SGLANG_TEST_REQUEST_TIME_STATS:
-                assert (
-                    bootstrap_duration >= 0
-                    and queue_duration >= 0
-                    and forward_duration >= 0
-                ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
-            return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time}"
-        # if decode
-        elif _type == self.RequestType.DECODE:
+                if self.wait_queue_entry_time > 0:
+                    assert (
+                        bootstrap_duration >= 0
+                        and queue_duration >= 0
+                        and forward_duration >= 0
+                    ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+            return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time:.3f}"
+        elif self.disagg_mode == DisaggregationMode.DECODE:
             prealloc_duration = (
                 self.decode_transfer_queue_entry_time
                 - self.decode_prealloc_queue_entry_time
             )
             transfer_duration = (
                 self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
             )
@@ -92,67 +86,74 @@ class TimeStats:
             forward_duration = self.completion_time - self.forward_entry_time
             if SGLANG_TEST_REQUEST_TIME_STATS:
-                assert (
-                    prealloc_duration >= 0
-                    and transfer_duration >= 0
-                    and queue_duration >= 0
-                    and forward_duration >= 0
-                ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
-            return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time}"
+                if self.wait_queue_entry_time > 0:
+                    assert (
+                        prealloc_duration >= 0
+                        and transfer_duration >= 0
+                        and queue_duration >= 0
+                        and forward_duration >= 0
+                    ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0. {self=}"
+            return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time:.3f}"
         else:
-            return "Invalid Time Stats"
+            return "Unknown Time Stats"
     def format_duration(self, duration: float) -> str:
         return f"{duration * 1e3:.2f}ms"
-    def get_type(self) -> RequestType:
-        """Determine the type of request based on timestamp values."""
-        if (
-            self.prefill_bootstrap_queue_entry_time == 0.0
-            and self.prefill_transfer_queue_entry_time == 0.0
-            and self.decode_prealloc_queue_entry_time == 0.0
-            and self.decode_transfer_queue_entry_time == 0.0
-        ):
-            return self.RequestType.UNIFIED
-        elif (
-            self.prefill_bootstrap_queue_entry_time > 0.0
-            and self.prefill_transfer_queue_entry_time > 0.0
-        ):
-            return self.RequestType.PREFILL
-        elif (
-            self.decode_prealloc_queue_entry_time > 0.0
-            and self.decode_transfer_queue_entry_time > 0.0
-            and self.wait_queue_entry_time > 0.0
-        ):
-            return self.RequestType.DECODE
+    def disagg_mode_str(self) -> str:
+        if self.disagg_mode == DisaggregationMode.NULL:
+            return "unified"
+        elif self.disagg_mode == DisaggregationMode.DECODE:
+            return "decode"
+        elif self.disagg_mode == DisaggregationMode.PREFILL:
+            return "prefill"
         else:
-            return self.RequestType.INVALID
+            return "unknown"
 @dataclass
 class SchedulerStats:
+    # Basics
     num_running_reqs: int = 0
     num_used_tokens: int = 0
     token_usage: float = 0.0
+    swa_token_usage: float = 0.0
     gen_throughput: float = 0.0
     num_queue_reqs: int = 0
-    cache_hit_rate: float = 0.0
     num_grammar_queue_reqs: int = 0
+    num_running_reqs_offline_batch: int = 0
+    cache_hit_rate: float = 0.0
+    # Speculative decoding
     spec_accept_length: float = 0.0
-    avg_request_queue_latency: float = 0.0
+    # Retract
+    num_retracted_reqs: int = 0
+    num_paused_reqs: int = 0
+    # PD disaggregation
     num_prefill_prealloc_queue_reqs: int = 0
     num_prefill_inflight_queue_reqs: int = 0
     num_decode_prealloc_queue_reqs: int = 0
     num_decode_transfer_queue_reqs: int = 0
-    total_retracted_reqs: int = 0
+    kv_transfer_speed_gb_s: float = 0.0
+    kv_transfer_latency_ms: float = 0.0
+    # Utilization
+    utilization: float = 0.0
+    max_running_requests_under_SLO: Optional[int] = None
+    # Engine startup
+    engine_startup_time: float = 0.0
+    engine_load_weights_time: float = 0.0
 class SchedulerMetricsCollector:
     def __init__(self, labels: Dict[str, str]) -> None:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
-        from prometheus_client import Counter, Gauge
+        from prometheus_client import Counter, Gauge, Histogram
         self.labels = labels
         self.last_log_time = time.perf_counter()
@@ -163,42 +164,48 @@ class SchedulerMetricsCollector:
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
         self.num_used_tokens = Gauge(
             name="sglang:num_used_tokens",
             documentation="The number of used tokens.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
         self.token_usage = Gauge(
             name="sglang:token_usage",
             documentation="The token usage.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
+        self.swa_token_usage = Gauge(
+            name="sglang:swa_token_usage",
+            documentation="The token usage for SWA layers.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
         self.gen_throughput = Gauge(
             name="sglang:gen_throughput",
             documentation="The generation throughput (token/s).",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
         self.num_queue_reqs = Gauge(
             name="sglang:num_queue_reqs",
             documentation="The number of requests in the waiting queue.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
         self.num_grammar_queue_reqs = Gauge(
             name="sglang:num_grammar_queue_reqs",
             documentation="The number of requests in the grammar waiting queue.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
+        self.num_running_reqs_offline_batch = Gauge(
+            name="sglang:num_running_reqs_offline_batch",
+            documentation="The number of running low-priority offline batch requests(label is 'batch').",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
         self.cache_hit_rate = Gauge(
             name="sglang:cache_hit_rate",
             documentation="The prefix cache hit rate.",
@@ -206,6 +213,7 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
+        # Speculative decoding
         self.spec_accept_length = Gauge(
             name="sglang:spec_accept_length",
             documentation="The average acceptance length of speculative decoding.",
@@ -213,83 +221,307 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
-        self.avg_request_queue_latency = Gauge(
-            name="sglang:avg_request_queue_latency",
-            documentation="The average request queue latency for the last batch of requests in seconds.",
+        # Retract
+        self.num_retracted_reqs = Gauge(
+            name="sglang:num_retracted_reqs",
+            documentation="The number of retracted requests.",
             labelnames=labels.keys(),
-            multiprocess_mode="mostrecent",
         )
-        self.total_retracted_reqs = Gauge(
-            name="sglang:total_retracted_reqs",
-            documentation="The total number of retracted requests due to kvcache full.",
+        self.num_paused_reqs = Gauge(
+            name="sglang:num_paused_reqs",
+            documentation="The number of paused requests by async weight sync.",
             labelnames=labels.keys(),
-            multiprocess_mode="mostrecent",
         )
-        # Disaggregation queue metrics
+        # PD disaggregation
         self.num_prefill_prealloc_queue_reqs = Gauge(
             name="sglang:num_prefill_prealloc_queue_reqs",
             documentation="The number of requests in the prefill prealloc queue.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
         self.num_prefill_inflight_queue_reqs = Gauge(
             name="sglang:num_prefill_inflight_queue_reqs",
             documentation="The number of requests in the prefill inflight queue.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
         self.num_decode_prealloc_queue_reqs = Gauge(
             name="sglang:num_decode_prealloc_queue_reqs",
             documentation="The number of requests in the decode prealloc queue.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
         self.num_decode_transfer_queue_reqs = Gauge(
             name="sglang:num_decode_transfer_queue_reqs",
             documentation="The number of requests in the decode transfer queue.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
         self.num_bootstrap_failed_reqs = Counter(
-            name="sglang:num_bootstrap_failed_reqs",
+            name="sglang:num_bootstrap_failed_reqs_total",
             documentation="The number of bootstrap failed requests.",
             labelnames=labels.keys(),
         )
         self.num_transfer_failed_reqs = Counter(
-            name="sglang:num_transfer_failed_reqs",
+            name="sglang:num_transfer_failed_reqs_total",
             documentation="The number of transfer failed requests.",
             labelnames=labels.keys(),
         )
+        self.kv_transfer_speed_gb_s = Gauge(
+            name="sglang:kv_transfer_speed_gb_s",
+            documentation="The transfer speed of the KV cache in GB/s.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.kv_transfer_latency_ms = Gauge(
+            name="sglang:kv_transfer_latency_ms",
+            documentation="The transfer latency of the KV cache in ms.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        # Utilization
+        self.utilization = Gauge(
+            name="sglang:utilization",
+            documentation="The utilization.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.max_running_requests_under_SLO = Gauge(
+            name="sglang:max_running_requests_under_SLO",
+            documentation="The maximum number of running requests under SLO.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        # Engine startup
+        self.engine_startup_time = Gauge(
+            name="sglang:engine_startup_time",
+            documentation="The time taken for the engine to start up.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.engine_load_weights_time = Gauge(
+            name="sglang:engine_load_weights_time",
+            documentation="The time taken for the engine to load weights.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        # Additional queueing time histogram
+        self.queue_time = Histogram(
+            name="sglang:queue_time_seconds",
+            documentation="Histogram of queueing time in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.0,
+                0.1,
+                0.2,
+                0.5,
+                1,
+                2,
+                3,
+                4,
+                5,
+                10,
+                15,
+                20,
+                30,
+                40,
+                50,
+                60,
+                70,
+                80,
+                90,
+                100,
+                200,
+                300,
+                400,
+                500,
+                600,
+                700,
+                800,
+                900,
+                1000,
+                1200,
+                1400,
+                1600,
+                1800,
+                2000,
+                2500,
+                3000,
+            ],
+        )
+        # Grammar metrics
+        self.grammar_compilation_time = Histogram(
+            name="sglang:grammar_compilation_time_seconds",
+            documentation="Histogram of grammar compilation time in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.0,
+                0.01,
+                0.02,
+                0.05,
+                0.1,
+                0.2,
+                0.5,
+                1,
+                2,
+                5,
+                10,
+                20,
+                30,
+                60,
+                90,
+                120,
+                240,
+            ],
+        )
+        self.num_grammar_cache_hit = Counter(
+            name="sglang:num_grammar_cache_hit_total",
+            documentation="Number of grammar cache hits.",
+            labelnames=labels.keys(),
+        )
+        self.num_grammar_aborted = Counter(
+            name="sglang:num_grammar_aborted_total",
+            documentation="Number of grammar aborted requests.",
+            labelnames=labels.keys(),
+        )
+        self.num_grammar_total = Counter(
+            name="sglang:num_grammar_total",
+            documentation="Number of the total grammar requests.",
+            labelnames=labels.keys(),
+        )
+        self.grammar_schema_count = Histogram(
+            name="sglang:grammar_schema_count",
+            documentation="Histogram of grammar schema count.",
+            labelnames=labels.keys(),
+            buckets=[
+                0,
+                1,
+                2,
+                5,
+                10,
+                20,
+                30,
+                40,
+                60,
+                80,
+                100,
+                120,
+                140,
+                160,
+                180,
+                200,
+                300,
+                400,
+                500,
+                700,
+                1000,
+            ],
+        )
+        self.grammar_ebnf_size = Histogram(
+            name="sglang:grammar_ebnf_size",
+            documentation="Histogram of grammar EBNF size.",
+            labelnames=labels.keys(),
+            buckets=[
+                0,
+                50,
+                100,
+                200,
+                300,
+                500,
+                1000,
+                2000,
+                3000,
+                5000,
+                10000,
+                20000,
+                30000,
+                50000,
+                100000,
+            ],
+        )
+        tree_traversal_time_buckets = [
+            0.0,
+            0.01,
+            0.02,
+            0.05,
+            0.1,
+            0.2,
+            0.5,
+            1,
+            2,
+            5,
+            10,
+            15,
+            30,
+            60,
+            90,
+            120,
+            240,
+        ]
+        self.grammar_tree_traversal_time_avg = Histogram(
+            name="sglang:grammar_tree_traversal_time_avg",
+            documentation="Histogram of average grammar tree traversal time in seconds.",
+            labelnames=labels.keys(),
+            buckets=tree_traversal_time_buckets,
+        )
+        self.grammar_tree_traversal_time_max = Histogram(
+            name="sglang:grammar_tree_traversal_time_max",
+            documentation="Histogram of max grammar tree traversal time in seconds.",
+            labelnames=labels.keys(),
+            buckets=tree_traversal_time_buckets,
+        )
+        self.per_stage_req_latency_seconds = Histogram(
+            name="sglang:per_stage_req_latency_seconds",
+            documentation="The latency of each stage of requests.",
+            # captures latency in range [1ms - ~1191s]
+            buckets=exponential_buckets(start=0.001, width=1.62, length=30),
+            labelnames=list(labels.keys()) + ["stage"],
+        )
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
+    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
+        histogram.labels(**self.labels).observe(data)
     def increment_bootstrap_failed_reqs(self) -> None:
         self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
     def increment_transfer_failed_reqs(self) -> None:
         self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
+    def observe_per_stage_req_latency(self, stage: str, latency: float) -> None:
+        labels_with_stage = {**self.labels, "stage": stage}
+        self.per_stage_req_latency_seconds.labels(**labels_with_stage).observe(latency)
+    def observe_queue_time(self, latency: float) -> None:
+        self._log_histogram(self.queue_time, latency)
     def log_stats(self, stats: SchedulerStats) -> None:
         self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
         self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
         self._log_gauge(self.token_usage, stats.token_usage)
+        self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
         self._log_gauge(self.gen_throughput, stats.gen_throughput)
         self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
         self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
+        self._log_gauge(
+            self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
+        )
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
+        # Speculative decoding
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
-        self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
-        # Disaggregation metrics
+        # PD disaggregation
         self._log_gauge(
             self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
         )
@@ -302,14 +534,58 @@ class SchedulerMetricsCollector:
         self._log_gauge(
             self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
         )
+        self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
+        self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
+        # Retract
+        self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
+        self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
+        # Utilization
+        self._log_gauge(self.utilization, stats.utilization)
+        if stats.max_running_requests_under_SLO is not None:
+            self._log_gauge(
+                self.max_running_requests_under_SLO,
+                stats.max_running_requests_under_SLO,
+            )
+        # Engine startup time
+        self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
+        if stats.engine_load_weights_time is not None:
+            self._log_gauge(
+                self.engine_load_weights_time, stats.engine_load_weights_time
+            )
         self.last_log_time = time.perf_counter()
+    def log_grammar_stats(self, grammar_stats) -> None:
+        # Duck-typed GrammarStats to avoid cross-package dependency
+        if getattr(grammar_stats, "compilation_time", None) is not None:
+            self._log_histogram(
+                self.grammar_compilation_time, grammar_stats.compilation_time
+            )
+        if getattr(grammar_stats, "schema_count", None) is not None:
+            self._log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
+        if getattr(grammar_stats, "ebnf_size", None) is not None:
+            self._log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
+        tree_times = getattr(grammar_stats, "tree_traversal_time", None)
+        if tree_times:
+            max_time = max(tree_times)
+            avg_time = sum(tree_times) / len(tree_times)
+            self._log_histogram(self.grammar_tree_traversal_time_max, max_time)
+            self._log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
+        if getattr(grammar_stats, "is_cache_hit", False):
+            self.num_grammar_cache_hit.labels(**self.labels).inc(1)
+        if getattr(grammar_stats, "is_grammar_aborted", False):
+            self.num_grammar_aborted.labels(**self.labels).inc(1)
+        self.num_grammar_total.labels(**self.labels).inc(1)
 class TokenizerMetricsCollector:
     def __init__(
         self,
-        labels: Dict[str, str],
+        server_args: Optional[ServerArgs] = None,
+        labels: Dict[str, str] = None,
         bucket_time_to_first_token: Optional[List[float]] = None,
         bucket_inter_token_latency: Optional[List[float]] = None,
         bucket_e2e_request_latency: Optional[List[float]] = None,
@@ -318,7 +594,7 @@ class TokenizerMetricsCollector:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
         from prometheus_client import Counter, Histogram
-        self.labels = labels
+        self.labels = labels or {}
         self.collect_tokens_histogram = collect_tokens_histogram
         self.prompt_tokens_total = Counter(
@@ -334,7 +610,7 @@ class TokenizerMetricsCollector:
         )
         if collect_tokens_histogram:
-            bucket_prompt_tokens = [
+            default_bucket_prompt_tokens = [
                 100,
                 300,
                 500,
@@ -358,39 +634,30 @@ class TokenizerMetricsCollector:
                 30000,
                 35000,
                 40000,
+                66000,
+                99000,
+                132000,
+                300000,
+                600000,
+                900000,
+                1100000,
             ]
             self.prompt_tokens_histogram = Histogram(
                 name="sglang:prompt_tokens_histogram",
                 documentation="Histogram of prompt token length.",
                 labelnames=labels.keys(),
-                buckets=bucket_prompt_tokens,
+                buckets=generate_buckets(
+                    server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
+                ),
             )
-            bucket_generation_tokens = [
-                100,
-                300,
-                500,
-                1000,
-                1200,
-                1500,
-                1700,
-                2000,
-                2500,
-                3000,
-                3500,
-                4000,
-                4500,
-                5000,
-                6000,
-                7000,
-                8000,
-                9000,
-                10000,
-            ]
             self.generation_tokens_histogram = Histogram(
                 name="sglang:generation_tokens_histogram",
                 documentation="Histogram of generation token length.",
                 labelnames=labels.keys(),
-                buckets=bucket_generation_tokens,
+                buckets=generate_buckets(
+                    server_args.generation_tokens_buckets,
+                    default_bucket_prompt_tokens,
+                ),
             )
         self.cached_tokens_total = Counter(
@@ -412,7 +679,7 @@ class TokenizerMetricsCollector:
         )
         self.num_aborted_requests_total = Counter(
-            name="sglang:num_aborted_requests",
+            name="sglang:num_aborted_requests_total",
             documentation="Number of requests aborted.",
             labelnames=labels.keys(),
         )
@@ -459,7 +726,10 @@ class TokenizerMetricsCollector:
                 100,
                 200,
                 400,
-                800,
+                600,
+                1200,
+                1800,
+                2400,
             ]
         if bucket_inter_token_latency is None:
@@ -496,7 +766,7 @@ class TokenizerMetricsCollector:
             buckets=bucket_time_to_first_token,
         )
-        self.histogram_inter_token_latency_seconds = Histogram(
+        self.histogram_inter_token_latency = Histogram(
             name="sglang:inter_token_latency_seconds",
             documentation="Histogram of inter-token latency in seconds.",
             labelnames=labels.keys(),
@@ -510,38 +780,53 @@ class TokenizerMetricsCollector:
             buckets=bucket_e2e_request_latency,
         )
-    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
-        histogram.labels(**self.labels).observe(data)
     def observe_one_finished_request(
         self,
+        labels: Dict[str, str],
         prompt_tokens: int,
         generation_tokens: int,
         cached_tokens: int,
         e2e_latency: float,
         has_grammar: bool,
     ):
-        self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
-        self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
+        self.prompt_tokens_total.labels(**labels).inc(prompt_tokens)
+        self.generation_tokens_total.labels(**labels).inc(generation_tokens)
         if cached_tokens > 0:
-            self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
-        self.num_requests_total.labels(**self.labels).inc(1)
+            self.cached_tokens_total.labels(**labels).inc(cached_tokens)
+        self.num_requests_total.labels(**labels).inc(1)
         if has_grammar:
-            self.num_so_requests_total.labels(**self.labels).inc(1)
-        self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
+            self.num_so_requests_total.labels(**labels).inc(1)
+        self.histogram_e2e_request_latency.labels(**labels).observe(float(e2e_latency))
         if self.collect_tokens_histogram:
-            self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
-            self._log_histogram(self.generation_tokens_histogram, generation_tokens)
-    def observe_time_to_first_token(self, value: float):
-        self.histogram_time_to_first_token.labels(**self.labels).observe(value)
+            self.prompt_tokens_histogram.labels(**labels).observe(float(prompt_tokens))
+            self.generation_tokens_histogram.labels(**labels).observe(
+                float(generation_tokens)
+            )
-    def observe_inter_token_latency(self, internval: float, num_new_tokens: int):
+    def observe_time_to_first_token(self, labels: Dict[str, str], value: float):
+        self.histogram_time_to_first_token.labels(**labels).observe(value)
+    def check_time_to_first_token_straggler(self, value: float) -> bool:
+        his = self.histogram_time_to_first_token.labels(**self.labels)
+        total_observations = sum(bucket._value for bucket in his._buckets)
+        if total_observations < 1000:
+            return False
+        p999_threshold = total_observations * 0.999
+        cumulative_count = 0
+        for i, bucket in enumerate(his._buckets):
+            cumulative_count += bucket._value
+            if cumulative_count > p999_threshold:
+                return value >= his._upper_bounds[i]
+        return False
+    def observe_inter_token_latency(
+        self, labels: Dict[str, str], internval: float, num_new_tokens: int
+    ):
         adjusted_interval = internval / num_new_tokens
         # A faster version of the Histogram::observe which observes multiple values at the same time.
         # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
-        his = self.histogram_inter_token_latency_seconds.labels(**self.labels)
+        his = self.histogram_inter_token_latency.labels(**labels)
         his._sum.inc(internval)
         for i, bound in enumerate(his._upper_bounds):
@@ -549,5 +834,107 @@ class TokenizerMetricsCollector:
                 his._buckets[i].inc(num_new_tokens)
                 break
-    def observe_one_aborted_request(self):
-        self.num_aborted_requests_total.labels(**self.labels).inc(1)
+    def observe_one_aborted_request(self, labels: Dict[str, str]):
+        self.num_aborted_requests_total.labels(**labels).inc(1)
+@dataclass
+class StorageMetrics:
+    prefetch_pgs: List[int] = field(default_factory=list)
+    backup_pgs: List[int] = field(default_factory=list)
+    prefetch_bandwidth: List[float] = field(default_factory=list)
+    backup_bandwidth: List[float] = field(default_factory=list)
+class StorageMetricsCollector:
+    def __init__(
+        self,
+        labels: Dict[str, str],
+    ):
+        from prometheus_client import Counter, Histogram
+        self.labels = labels
+        self.prefetched_tokens_total = Counter(
+            name="sglang:prefetched_tokens_total",
+            documentation="Number of prefetched prompt tokens.",
+            labelnames=labels.keys(),
+        )
+        self.backuped_tokens_total = Counter(
+            name="sglang:backuped_tokens_total",
+            documentation="Number of backuped tokens.",
+            labelnames=labels.keys(),
+        )
+        bucket_io = [
+            1,
+            5,
+            10,
+            50,
+            100,
+        ]
+        bucket_bandwidth = [
+            0.1,
+            0.5,
+            1,
+            5,
+            10,
+            50,
+            100,
+        ]
+        self.histogram_prefetch_pgs = Histogram(
+            name="sglang:prefetch_pgs",
+            documentation="Histogram of prefetch pages of batches.",
+            labelnames=labels.keys(),
+            buckets=bucket_io,
+        )
+        self.histogram_backup_pgs = Histogram(
+            name="sglang:backup_pgs",
+            documentation="Histogram of backup pages of batches.",
+            labelnames=labels.keys(),
+            buckets=bucket_io,
+        )
+        self.histogram_prefetch_bandwidth = Histogram(
+            name="sglang:prefetch_bandwidth",
+            documentation="Histogram of prefetch bandwidth in GB/s.",
+            labelnames=labels.keys(),
+            buckets=bucket_bandwidth,
+        )
+        self.histogram_backup_bandwidth = Histogram(
+            name="sglang:backup_bandwidth",
+            documentation="Histogram of backup bandwidth in GB/s.",
+            labelnames=labels.keys(),
+            buckets=bucket_bandwidth,
+        )
+    def log_prefetched_tokens(self, prefetched_tokens: int):
+        if prefetched_tokens > 0:
+            self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)
+    def log_backuped_tokens(self, backuped_tokens: int):
+        if backuped_tokens > 0:
+            self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)
+    def _log_histogram(self, histogram, data: Union[int, float]):
+        histogram.labels(**self.labels).observe(data)
+    def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
+        if storage_metrics is None:
+            return
+        assert isinstance(storage_metrics, StorageMetrics)
+        for v in storage_metrics.prefetch_pgs:
+            self._log_histogram(self.histogram_prefetch_pgs, v)
+        for v in storage_metrics.backup_pgs:
+            self._log_histogram(self.histogram_backup_pgs, v)
+        for v in storage_metrics.prefetch_bandwidth:
+            self._log_histogram(self.histogram_prefetch_bandwidth, v)
+        for v in storage_metrics.backup_bandwidth:
+            self._log_histogram(self.histogram_backup_bandwidth, v)

sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl

sglang 0.5.2rc1py3-none-any.whl → 0.5.3py3-none-any.whl