sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +192 -113
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +132 -57
- sglang/srt/entrypoints/openai/protocol.py +115 -7
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +207 -58
- sglang/srt/entrypoints/openai/serving_completions.py +17 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +49 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +106 -82
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +53 -7
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +225 -57
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +78 -49
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +215 -314
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +147 -19
- sglang/srt/managers/scheduler.py +501 -304
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +321 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +15 -21
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +58 -34
- sglang/srt/mem_cache/hiradix_cache.py +227 -80
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -223
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +519 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +55 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +98 -57
- sglang/srt/model_executor/model_runner.py +433 -158
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +833 -152
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +14 -5
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +124 -14
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +26 -5
- sglang/srt/models/qwen3_moe.py +71 -12
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +10 -3
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1030 -254
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +253 -136
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +445 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +22 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -12,12 +12,13 @@
|
|
12
12
|
# limitations under the License.
|
13
13
|
# ==============================================================================
|
14
14
|
"""Utilities for Prometheus Metrics Collection."""
|
15
|
-
|
16
15
|
import time
|
17
|
-
from dataclasses import dataclass
|
18
|
-
from enum import Enum
|
16
|
+
from dataclasses import dataclass, field
|
19
17
|
from typing import Dict, List, Optional, Union
|
20
18
|
|
19
|
+
from sglang.srt.disaggregation.utils import DisaggregationMode
|
20
|
+
from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
|
21
|
+
from sglang.srt.server_args import ServerArgs
|
21
22
|
from sglang.srt.utils import get_bool_env_var
|
22
23
|
|
23
24
|
SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
|
@@ -33,6 +34,7 @@ class TimeStats:
|
|
33
34
|
Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
|
34
35
|
"""
|
35
36
|
|
37
|
+
disagg_mode: DisaggregationMode = DisaggregationMode.NULL
|
36
38
|
lb_entry_time: float = 0.0
|
37
39
|
wait_queue_entry_time: float = 0.0
|
38
40
|
forward_entry_time: float = 0.0
|
@@ -42,17 +44,11 @@ class TimeStats:
|
|
42
44
|
decode_prealloc_queue_entry_time: float = 0.0
|
43
45
|
decode_transfer_queue_entry_time: float = 0.0
|
44
46
|
|
45
|
-
|
46
|
-
|
47
|
-
PREFILL = "prefill"
|
48
|
-
DECODE = "decode"
|
49
|
-
INVALID = "invalid"
|
50
|
-
|
51
|
-
def __str__(self) -> str:
|
52
|
-
# if unified
|
53
|
-
_type = self.get_type()
|
47
|
+
def get_queueing_time(self) -> float:
|
48
|
+
return self.forward_entry_time - self.wait_queue_entry_time
|
54
49
|
|
55
|
-
|
50
|
+
def convert_to_duration(self) -> str:
|
51
|
+
if self.disagg_mode == DisaggregationMode.NULL:
|
56
52
|
queue_duration = self.forward_entry_time - self.wait_queue_entry_time
|
57
53
|
forward_duration = self.completion_time - self.forward_entry_time
|
58
54
|
|
@@ -61,30 +57,28 @@ class TimeStats:
|
|
61
57
|
queue_duration >= 0 and forward_duration >= 0
|
62
58
|
), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
|
63
59
|
|
64
|
-
return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
|
65
|
-
elif
|
60
|
+
return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time:.3f}"
|
61
|
+
elif self.disagg_mode == DisaggregationMode.PREFILL:
|
66
62
|
bootstrap_duration = (
|
67
63
|
self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
|
68
64
|
)
|
69
|
-
|
70
65
|
queue_duration = self.forward_entry_time - self.wait_queue_entry_time
|
71
|
-
|
72
66
|
forward_duration = self.completion_time - self.forward_entry_time
|
73
67
|
|
74
68
|
if SGLANG_TEST_REQUEST_TIME_STATS:
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
69
|
+
if self.wait_queue_entry_time > 0:
|
70
|
+
assert (
|
71
|
+
bootstrap_duration >= 0
|
72
|
+
and queue_duration >= 0
|
73
|
+
and forward_duration >= 0
|
74
|
+
), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
|
75
|
+
|
76
|
+
return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time:.3f}"
|
77
|
+
elif self.disagg_mode == DisaggregationMode.DECODE:
|
83
78
|
prealloc_duration = (
|
84
79
|
self.decode_transfer_queue_entry_time
|
85
80
|
- self.decode_prealloc_queue_entry_time
|
86
81
|
)
|
87
|
-
|
88
82
|
transfer_duration = (
|
89
83
|
self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
|
90
84
|
)
|
@@ -92,67 +86,74 @@ class TimeStats:
|
|
92
86
|
forward_duration = self.completion_time - self.forward_entry_time
|
93
87
|
|
94
88
|
if SGLANG_TEST_REQUEST_TIME_STATS:
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
89
|
+
if self.wait_queue_entry_time > 0:
|
90
|
+
assert (
|
91
|
+
prealloc_duration >= 0
|
92
|
+
and transfer_duration >= 0
|
93
|
+
and queue_duration >= 0
|
94
|
+
and forward_duration >= 0
|
95
|
+
), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0. {self=}"
|
96
|
+
|
97
|
+
return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time:.3f}"
|
103
98
|
else:
|
104
|
-
return "
|
99
|
+
return "Unknown Time Stats"
|
105
100
|
|
106
101
|
def format_duration(self, duration: float) -> str:
|
107
102
|
return f"{duration * 1e3:.2f}ms"
|
108
103
|
|
109
|
-
def
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
):
|
117
|
-
return self.RequestType.UNIFIED
|
118
|
-
elif (
|
119
|
-
self.prefill_bootstrap_queue_entry_time > 0.0
|
120
|
-
and self.prefill_transfer_queue_entry_time > 0.0
|
121
|
-
):
|
122
|
-
return self.RequestType.PREFILL
|
123
|
-
elif (
|
124
|
-
self.decode_prealloc_queue_entry_time > 0.0
|
125
|
-
and self.decode_transfer_queue_entry_time > 0.0
|
126
|
-
and self.wait_queue_entry_time > 0.0
|
127
|
-
):
|
128
|
-
return self.RequestType.DECODE
|
104
|
+
def disagg_mode_str(self) -> str:
|
105
|
+
if self.disagg_mode == DisaggregationMode.NULL:
|
106
|
+
return "unified"
|
107
|
+
elif self.disagg_mode == DisaggregationMode.DECODE:
|
108
|
+
return "decode"
|
109
|
+
elif self.disagg_mode == DisaggregationMode.PREFILL:
|
110
|
+
return "prefill"
|
129
111
|
else:
|
130
|
-
return
|
112
|
+
return "unknown"
|
131
113
|
|
132
114
|
|
133
115
|
@dataclass
|
134
116
|
class SchedulerStats:
|
117
|
+
# Basics
|
135
118
|
num_running_reqs: int = 0
|
136
119
|
num_used_tokens: int = 0
|
137
120
|
token_usage: float = 0.0
|
121
|
+
swa_token_usage: float = 0.0
|
138
122
|
gen_throughput: float = 0.0
|
139
123
|
num_queue_reqs: int = 0
|
140
|
-
cache_hit_rate: float = 0.0
|
141
124
|
num_grammar_queue_reqs: int = 0
|
125
|
+
num_running_reqs_offline_batch: int = 0
|
126
|
+
cache_hit_rate: float = 0.0
|
127
|
+
|
128
|
+
# Speculative decoding
|
142
129
|
spec_accept_length: float = 0.0
|
143
|
-
|
130
|
+
|
131
|
+
# Retract
|
132
|
+
num_retracted_reqs: int = 0
|
133
|
+
num_paused_reqs: int = 0
|
134
|
+
|
135
|
+
# PD disaggregation
|
144
136
|
num_prefill_prealloc_queue_reqs: int = 0
|
145
137
|
num_prefill_inflight_queue_reqs: int = 0
|
146
138
|
num_decode_prealloc_queue_reqs: int = 0
|
147
139
|
num_decode_transfer_queue_reqs: int = 0
|
148
|
-
|
140
|
+
kv_transfer_speed_gb_s: float = 0.0
|
141
|
+
kv_transfer_latency_ms: float = 0.0
|
142
|
+
|
143
|
+
# Utilization
|
144
|
+
utilization: float = 0.0
|
145
|
+
max_running_requests_under_SLO: Optional[int] = None
|
146
|
+
|
147
|
+
# Engine startup
|
148
|
+
engine_startup_time: float = 0.0
|
149
|
+
engine_load_weights_time: float = 0.0
|
149
150
|
|
150
151
|
|
151
152
|
class SchedulerMetricsCollector:
|
152
153
|
|
153
154
|
def __init__(self, labels: Dict[str, str]) -> None:
|
154
155
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
155
|
-
from prometheus_client import Counter, Gauge
|
156
|
+
from prometheus_client import Counter, Gauge, Histogram
|
156
157
|
|
157
158
|
self.labels = labels
|
158
159
|
self.last_log_time = time.perf_counter()
|
@@ -163,42 +164,48 @@ class SchedulerMetricsCollector:
|
|
163
164
|
labelnames=labels.keys(),
|
164
165
|
multiprocess_mode="mostrecent",
|
165
166
|
)
|
166
|
-
|
167
167
|
self.num_used_tokens = Gauge(
|
168
168
|
name="sglang:num_used_tokens",
|
169
169
|
documentation="The number of used tokens.",
|
170
170
|
labelnames=labels.keys(),
|
171
171
|
multiprocess_mode="mostrecent",
|
172
172
|
)
|
173
|
-
|
174
173
|
self.token_usage = Gauge(
|
175
174
|
name="sglang:token_usage",
|
176
175
|
documentation="The token usage.",
|
177
176
|
labelnames=labels.keys(),
|
178
177
|
multiprocess_mode="mostrecent",
|
179
178
|
)
|
180
|
-
|
179
|
+
self.swa_token_usage = Gauge(
|
180
|
+
name="sglang:swa_token_usage",
|
181
|
+
documentation="The token usage for SWA layers.",
|
182
|
+
labelnames=labels.keys(),
|
183
|
+
multiprocess_mode="mostrecent",
|
184
|
+
)
|
181
185
|
self.gen_throughput = Gauge(
|
182
186
|
name="sglang:gen_throughput",
|
183
187
|
documentation="The generation throughput (token/s).",
|
184
188
|
labelnames=labels.keys(),
|
185
189
|
multiprocess_mode="mostrecent",
|
186
190
|
)
|
187
|
-
|
188
191
|
self.num_queue_reqs = Gauge(
|
189
192
|
name="sglang:num_queue_reqs",
|
190
193
|
documentation="The number of requests in the waiting queue.",
|
191
194
|
labelnames=labels.keys(),
|
192
195
|
multiprocess_mode="mostrecent",
|
193
196
|
)
|
194
|
-
|
195
197
|
self.num_grammar_queue_reqs = Gauge(
|
196
198
|
name="sglang:num_grammar_queue_reqs",
|
197
199
|
documentation="The number of requests in the grammar waiting queue.",
|
198
200
|
labelnames=labels.keys(),
|
199
201
|
multiprocess_mode="mostrecent",
|
200
202
|
)
|
201
|
-
|
203
|
+
self.num_running_reqs_offline_batch = Gauge(
|
204
|
+
name="sglang:num_running_reqs_offline_batch",
|
205
|
+
documentation="The number of running low-priority offline batch requests(label is 'batch').",
|
206
|
+
labelnames=labels.keys(),
|
207
|
+
multiprocess_mode="mostrecent",
|
208
|
+
)
|
202
209
|
self.cache_hit_rate = Gauge(
|
203
210
|
name="sglang:cache_hit_rate",
|
204
211
|
documentation="The prefix cache hit rate.",
|
@@ -206,6 +213,7 @@ class SchedulerMetricsCollector:
|
|
206
213
|
multiprocess_mode="mostrecent",
|
207
214
|
)
|
208
215
|
|
216
|
+
# Speculative decoding
|
209
217
|
self.spec_accept_length = Gauge(
|
210
218
|
name="sglang:spec_accept_length",
|
211
219
|
documentation="The average acceptance length of speculative decoding.",
|
@@ -213,83 +221,307 @@ class SchedulerMetricsCollector:
|
|
213
221
|
multiprocess_mode="mostrecent",
|
214
222
|
)
|
215
223
|
|
216
|
-
|
217
|
-
|
218
|
-
|
224
|
+
# Retract
|
225
|
+
self.num_retracted_reqs = Gauge(
|
226
|
+
name="sglang:num_retracted_reqs",
|
227
|
+
documentation="The number of retracted requests.",
|
219
228
|
labelnames=labels.keys(),
|
220
|
-
multiprocess_mode="mostrecent",
|
221
229
|
)
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
documentation="The total number of retracted requests due to kvcache full.",
|
230
|
+
self.num_paused_reqs = Gauge(
|
231
|
+
name="sglang:num_paused_reqs",
|
232
|
+
documentation="The number of paused requests by async weight sync.",
|
226
233
|
labelnames=labels.keys(),
|
227
|
-
multiprocess_mode="mostrecent",
|
228
234
|
)
|
229
235
|
|
230
|
-
#
|
236
|
+
# PD disaggregation
|
231
237
|
self.num_prefill_prealloc_queue_reqs = Gauge(
|
232
238
|
name="sglang:num_prefill_prealloc_queue_reqs",
|
233
239
|
documentation="The number of requests in the prefill prealloc queue.",
|
234
240
|
labelnames=labels.keys(),
|
235
241
|
multiprocess_mode="mostrecent",
|
236
242
|
)
|
237
|
-
|
238
243
|
self.num_prefill_inflight_queue_reqs = Gauge(
|
239
244
|
name="sglang:num_prefill_inflight_queue_reqs",
|
240
245
|
documentation="The number of requests in the prefill inflight queue.",
|
241
246
|
labelnames=labels.keys(),
|
242
247
|
multiprocess_mode="mostrecent",
|
243
248
|
)
|
244
|
-
|
245
249
|
self.num_decode_prealloc_queue_reqs = Gauge(
|
246
250
|
name="sglang:num_decode_prealloc_queue_reqs",
|
247
251
|
documentation="The number of requests in the decode prealloc queue.",
|
248
252
|
labelnames=labels.keys(),
|
249
253
|
multiprocess_mode="mostrecent",
|
250
254
|
)
|
251
|
-
|
252
255
|
self.num_decode_transfer_queue_reqs = Gauge(
|
253
256
|
name="sglang:num_decode_transfer_queue_reqs",
|
254
257
|
documentation="The number of requests in the decode transfer queue.",
|
255
258
|
labelnames=labels.keys(),
|
256
259
|
multiprocess_mode="mostrecent",
|
257
260
|
)
|
258
|
-
|
259
261
|
self.num_bootstrap_failed_reqs = Counter(
|
260
|
-
name="sglang:
|
262
|
+
name="sglang:num_bootstrap_failed_reqs_total",
|
261
263
|
documentation="The number of bootstrap failed requests.",
|
262
264
|
labelnames=labels.keys(),
|
263
265
|
)
|
264
|
-
|
265
266
|
self.num_transfer_failed_reqs = Counter(
|
266
|
-
name="sglang:
|
267
|
+
name="sglang:num_transfer_failed_reqs_total",
|
267
268
|
documentation="The number of transfer failed requests.",
|
268
269
|
labelnames=labels.keys(),
|
269
270
|
)
|
271
|
+
self.kv_transfer_speed_gb_s = Gauge(
|
272
|
+
name="sglang:kv_transfer_speed_gb_s",
|
273
|
+
documentation="The transfer speed of the KV cache in GB/s.",
|
274
|
+
labelnames=labels.keys(),
|
275
|
+
multiprocess_mode="mostrecent",
|
276
|
+
)
|
277
|
+
self.kv_transfer_latency_ms = Gauge(
|
278
|
+
name="sglang:kv_transfer_latency_ms",
|
279
|
+
documentation="The transfer latency of the KV cache in ms.",
|
280
|
+
labelnames=labels.keys(),
|
281
|
+
multiprocess_mode="mostrecent",
|
282
|
+
)
|
283
|
+
|
284
|
+
# Utilization
|
285
|
+
self.utilization = Gauge(
|
286
|
+
name="sglang:utilization",
|
287
|
+
documentation="The utilization.",
|
288
|
+
labelnames=labels.keys(),
|
289
|
+
multiprocess_mode="mostrecent",
|
290
|
+
)
|
291
|
+
self.max_running_requests_under_SLO = Gauge(
|
292
|
+
name="sglang:max_running_requests_under_SLO",
|
293
|
+
documentation="The maximum number of running requests under SLO.",
|
294
|
+
labelnames=labels.keys(),
|
295
|
+
multiprocess_mode="mostrecent",
|
296
|
+
)
|
297
|
+
|
298
|
+
# Engine startup
|
299
|
+
self.engine_startup_time = Gauge(
|
300
|
+
name="sglang:engine_startup_time",
|
301
|
+
documentation="The time taken for the engine to start up.",
|
302
|
+
labelnames=labels.keys(),
|
303
|
+
multiprocess_mode="mostrecent",
|
304
|
+
)
|
305
|
+
self.engine_load_weights_time = Gauge(
|
306
|
+
name="sglang:engine_load_weights_time",
|
307
|
+
documentation="The time taken for the engine to load weights.",
|
308
|
+
labelnames=labels.keys(),
|
309
|
+
multiprocess_mode="mostrecent",
|
310
|
+
)
|
311
|
+
|
312
|
+
# Additional queueing time histogram
|
313
|
+
self.queue_time = Histogram(
|
314
|
+
name="sglang:queue_time_seconds",
|
315
|
+
documentation="Histogram of queueing time in seconds.",
|
316
|
+
labelnames=labels.keys(),
|
317
|
+
buckets=[
|
318
|
+
0.0,
|
319
|
+
0.1,
|
320
|
+
0.2,
|
321
|
+
0.5,
|
322
|
+
1,
|
323
|
+
2,
|
324
|
+
3,
|
325
|
+
4,
|
326
|
+
5,
|
327
|
+
10,
|
328
|
+
15,
|
329
|
+
20,
|
330
|
+
30,
|
331
|
+
40,
|
332
|
+
50,
|
333
|
+
60,
|
334
|
+
70,
|
335
|
+
80,
|
336
|
+
90,
|
337
|
+
100,
|
338
|
+
200,
|
339
|
+
300,
|
340
|
+
400,
|
341
|
+
500,
|
342
|
+
600,
|
343
|
+
700,
|
344
|
+
800,
|
345
|
+
900,
|
346
|
+
1000,
|
347
|
+
1200,
|
348
|
+
1400,
|
349
|
+
1600,
|
350
|
+
1800,
|
351
|
+
2000,
|
352
|
+
2500,
|
353
|
+
3000,
|
354
|
+
],
|
355
|
+
)
|
356
|
+
|
357
|
+
# Grammar metrics
|
358
|
+
self.grammar_compilation_time = Histogram(
|
359
|
+
name="sglang:grammar_compilation_time_seconds",
|
360
|
+
documentation="Histogram of grammar compilation time in seconds.",
|
361
|
+
labelnames=labels.keys(),
|
362
|
+
buckets=[
|
363
|
+
0.0,
|
364
|
+
0.01,
|
365
|
+
0.02,
|
366
|
+
0.05,
|
367
|
+
0.1,
|
368
|
+
0.2,
|
369
|
+
0.5,
|
370
|
+
1,
|
371
|
+
2,
|
372
|
+
5,
|
373
|
+
10,
|
374
|
+
20,
|
375
|
+
30,
|
376
|
+
60,
|
377
|
+
90,
|
378
|
+
120,
|
379
|
+
240,
|
380
|
+
],
|
381
|
+
)
|
382
|
+
self.num_grammar_cache_hit = Counter(
|
383
|
+
name="sglang:num_grammar_cache_hit_total",
|
384
|
+
documentation="Number of grammar cache hits.",
|
385
|
+
labelnames=labels.keys(),
|
386
|
+
)
|
387
|
+
self.num_grammar_aborted = Counter(
|
388
|
+
name="sglang:num_grammar_aborted_total",
|
389
|
+
documentation="Number of grammar aborted requests.",
|
390
|
+
labelnames=labels.keys(),
|
391
|
+
)
|
392
|
+
self.num_grammar_total = Counter(
|
393
|
+
name="sglang:num_grammar_total",
|
394
|
+
documentation="Number of the total grammar requests.",
|
395
|
+
labelnames=labels.keys(),
|
396
|
+
)
|
397
|
+
self.grammar_schema_count = Histogram(
|
398
|
+
name="sglang:grammar_schema_count",
|
399
|
+
documentation="Histogram of grammar schema count.",
|
400
|
+
labelnames=labels.keys(),
|
401
|
+
buckets=[
|
402
|
+
0,
|
403
|
+
1,
|
404
|
+
2,
|
405
|
+
5,
|
406
|
+
10,
|
407
|
+
20,
|
408
|
+
30,
|
409
|
+
40,
|
410
|
+
60,
|
411
|
+
80,
|
412
|
+
100,
|
413
|
+
120,
|
414
|
+
140,
|
415
|
+
160,
|
416
|
+
180,
|
417
|
+
200,
|
418
|
+
300,
|
419
|
+
400,
|
420
|
+
500,
|
421
|
+
700,
|
422
|
+
1000,
|
423
|
+
],
|
424
|
+
)
|
425
|
+
self.grammar_ebnf_size = Histogram(
|
426
|
+
name="sglang:grammar_ebnf_size",
|
427
|
+
documentation="Histogram of grammar EBNF size.",
|
428
|
+
labelnames=labels.keys(),
|
429
|
+
buckets=[
|
430
|
+
0,
|
431
|
+
50,
|
432
|
+
100,
|
433
|
+
200,
|
434
|
+
300,
|
435
|
+
500,
|
436
|
+
1000,
|
437
|
+
2000,
|
438
|
+
3000,
|
439
|
+
5000,
|
440
|
+
10000,
|
441
|
+
20000,
|
442
|
+
30000,
|
443
|
+
50000,
|
444
|
+
100000,
|
445
|
+
],
|
446
|
+
)
|
447
|
+
|
448
|
+
tree_traversal_time_buckets = [
|
449
|
+
0.0,
|
450
|
+
0.01,
|
451
|
+
0.02,
|
452
|
+
0.05,
|
453
|
+
0.1,
|
454
|
+
0.2,
|
455
|
+
0.5,
|
456
|
+
1,
|
457
|
+
2,
|
458
|
+
5,
|
459
|
+
10,
|
460
|
+
15,
|
461
|
+
30,
|
462
|
+
60,
|
463
|
+
90,
|
464
|
+
120,
|
465
|
+
240,
|
466
|
+
]
|
467
|
+
self.grammar_tree_traversal_time_avg = Histogram(
|
468
|
+
name="sglang:grammar_tree_traversal_time_avg",
|
469
|
+
documentation="Histogram of average grammar tree traversal time in seconds.",
|
470
|
+
labelnames=labels.keys(),
|
471
|
+
buckets=tree_traversal_time_buckets,
|
472
|
+
)
|
473
|
+
self.grammar_tree_traversal_time_max = Histogram(
|
474
|
+
name="sglang:grammar_tree_traversal_time_max",
|
475
|
+
documentation="Histogram of max grammar tree traversal time in seconds.",
|
476
|
+
labelnames=labels.keys(),
|
477
|
+
buckets=tree_traversal_time_buckets,
|
478
|
+
)
|
479
|
+
|
480
|
+
self.per_stage_req_latency_seconds = Histogram(
|
481
|
+
name="sglang:per_stage_req_latency_seconds",
|
482
|
+
documentation="The latency of each stage of requests.",
|
483
|
+
# captures latency in range [1ms - ~1191s]
|
484
|
+
buckets=exponential_buckets(start=0.001, width=1.62, length=30),
|
485
|
+
labelnames=list(labels.keys()) + ["stage"],
|
486
|
+
)
|
270
487
|
|
271
488
|
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
|
272
489
|
# Convenience function for logging to gauge.
|
273
490
|
gauge.labels(**self.labels).set(data)
|
274
491
|
|
492
|
+
def _log_histogram(self, histogram, data: Union[int, float]) -> None:
|
493
|
+
histogram.labels(**self.labels).observe(data)
|
494
|
+
|
275
495
|
def increment_bootstrap_failed_reqs(self) -> None:
|
276
496
|
self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
|
277
497
|
|
278
498
|
def increment_transfer_failed_reqs(self) -> None:
|
279
499
|
self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
|
280
500
|
|
501
|
+
def observe_per_stage_req_latency(self, stage: str, latency: float) -> None:
|
502
|
+
labels_with_stage = {**self.labels, "stage": stage}
|
503
|
+
self.per_stage_req_latency_seconds.labels(**labels_with_stage).observe(latency)
|
504
|
+
|
505
|
+
def observe_queue_time(self, latency: float) -> None:
|
506
|
+
self._log_histogram(self.queue_time, latency)
|
507
|
+
|
281
508
|
def log_stats(self, stats: SchedulerStats) -> None:
|
282
509
|
self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
|
283
510
|
self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
|
284
511
|
self._log_gauge(self.token_usage, stats.token_usage)
|
512
|
+
self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
|
285
513
|
self._log_gauge(self.gen_throughput, stats.gen_throughput)
|
286
514
|
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
|
287
515
|
self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
|
516
|
+
self._log_gauge(
|
517
|
+
self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
|
518
|
+
)
|
288
519
|
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
520
|
+
|
521
|
+
# Speculative decoding
|
289
522
|
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
290
|
-
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
291
523
|
|
292
|
-
#
|
524
|
+
# PD disaggregation
|
293
525
|
self._log_gauge(
|
294
526
|
self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
|
295
527
|
)
|
@@ -302,14 +534,58 @@ class SchedulerMetricsCollector:
|
|
302
534
|
self._log_gauge(
|
303
535
|
self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
|
304
536
|
)
|
537
|
+
self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
|
538
|
+
self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
|
539
|
+
|
540
|
+
# Retract
|
541
|
+
self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
|
542
|
+
self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
|
543
|
+
|
544
|
+
# Utilization
|
545
|
+
self._log_gauge(self.utilization, stats.utilization)
|
546
|
+
if stats.max_running_requests_under_SLO is not None:
|
547
|
+
self._log_gauge(
|
548
|
+
self.max_running_requests_under_SLO,
|
549
|
+
stats.max_running_requests_under_SLO,
|
550
|
+
)
|
551
|
+
|
552
|
+
# Engine startup time
|
553
|
+
self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
|
554
|
+
if stats.engine_load_weights_time is not None:
|
555
|
+
self._log_gauge(
|
556
|
+
self.engine_load_weights_time, stats.engine_load_weights_time
|
557
|
+
)
|
305
558
|
|
306
559
|
self.last_log_time = time.perf_counter()
|
307
560
|
|
561
|
+
def log_grammar_stats(self, grammar_stats) -> None:
|
562
|
+
# Duck-typed GrammarStats to avoid cross-package dependency
|
563
|
+
if getattr(grammar_stats, "compilation_time", None) is not None:
|
564
|
+
self._log_histogram(
|
565
|
+
self.grammar_compilation_time, grammar_stats.compilation_time
|
566
|
+
)
|
567
|
+
if getattr(grammar_stats, "schema_count", None) is not None:
|
568
|
+
self._log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
|
569
|
+
if getattr(grammar_stats, "ebnf_size", None) is not None:
|
570
|
+
self._log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
|
571
|
+
tree_times = getattr(grammar_stats, "tree_traversal_time", None)
|
572
|
+
if tree_times:
|
573
|
+
max_time = max(tree_times)
|
574
|
+
avg_time = sum(tree_times) / len(tree_times)
|
575
|
+
self._log_histogram(self.grammar_tree_traversal_time_max, max_time)
|
576
|
+
self._log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
|
577
|
+
if getattr(grammar_stats, "is_cache_hit", False):
|
578
|
+
self.num_grammar_cache_hit.labels(**self.labels).inc(1)
|
579
|
+
if getattr(grammar_stats, "is_grammar_aborted", False):
|
580
|
+
self.num_grammar_aborted.labels(**self.labels).inc(1)
|
581
|
+
self.num_grammar_total.labels(**self.labels).inc(1)
|
582
|
+
|
308
583
|
|
309
584
|
class TokenizerMetricsCollector:
|
310
585
|
def __init__(
|
311
586
|
self,
|
312
|
-
|
587
|
+
server_args: Optional[ServerArgs] = None,
|
588
|
+
labels: Dict[str, str] = None,
|
313
589
|
bucket_time_to_first_token: Optional[List[float]] = None,
|
314
590
|
bucket_inter_token_latency: Optional[List[float]] = None,
|
315
591
|
bucket_e2e_request_latency: Optional[List[float]] = None,
|
@@ -318,7 +594,7 @@ class TokenizerMetricsCollector:
|
|
318
594
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
319
595
|
from prometheus_client import Counter, Histogram
|
320
596
|
|
321
|
-
self.labels = labels
|
597
|
+
self.labels = labels or {}
|
322
598
|
self.collect_tokens_histogram = collect_tokens_histogram
|
323
599
|
|
324
600
|
self.prompt_tokens_total = Counter(
|
@@ -334,7 +610,7 @@ class TokenizerMetricsCollector:
|
|
334
610
|
)
|
335
611
|
|
336
612
|
if collect_tokens_histogram:
|
337
|
-
|
613
|
+
default_bucket_prompt_tokens = [
|
338
614
|
100,
|
339
615
|
300,
|
340
616
|
500,
|
@@ -358,39 +634,30 @@ class TokenizerMetricsCollector:
|
|
358
634
|
30000,
|
359
635
|
35000,
|
360
636
|
40000,
|
637
|
+
66000,
|
638
|
+
99000,
|
639
|
+
132000,
|
640
|
+
300000,
|
641
|
+
600000,
|
642
|
+
900000,
|
643
|
+
1100000,
|
361
644
|
]
|
362
645
|
self.prompt_tokens_histogram = Histogram(
|
363
646
|
name="sglang:prompt_tokens_histogram",
|
364
647
|
documentation="Histogram of prompt token length.",
|
365
648
|
labelnames=labels.keys(),
|
366
|
-
buckets=
|
649
|
+
buckets=generate_buckets(
|
650
|
+
server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
|
651
|
+
),
|
367
652
|
)
|
368
|
-
bucket_generation_tokens = [
|
369
|
-
100,
|
370
|
-
300,
|
371
|
-
500,
|
372
|
-
1000,
|
373
|
-
1200,
|
374
|
-
1500,
|
375
|
-
1700,
|
376
|
-
2000,
|
377
|
-
2500,
|
378
|
-
3000,
|
379
|
-
3500,
|
380
|
-
4000,
|
381
|
-
4500,
|
382
|
-
5000,
|
383
|
-
6000,
|
384
|
-
7000,
|
385
|
-
8000,
|
386
|
-
9000,
|
387
|
-
10000,
|
388
|
-
]
|
389
653
|
self.generation_tokens_histogram = Histogram(
|
390
654
|
name="sglang:generation_tokens_histogram",
|
391
655
|
documentation="Histogram of generation token length.",
|
392
656
|
labelnames=labels.keys(),
|
393
|
-
buckets=
|
657
|
+
buckets=generate_buckets(
|
658
|
+
server_args.generation_tokens_buckets,
|
659
|
+
default_bucket_prompt_tokens,
|
660
|
+
),
|
394
661
|
)
|
395
662
|
|
396
663
|
self.cached_tokens_total = Counter(
|
@@ -412,7 +679,7 @@ class TokenizerMetricsCollector:
|
|
412
679
|
)
|
413
680
|
|
414
681
|
self.num_aborted_requests_total = Counter(
|
415
|
-
name="sglang:
|
682
|
+
name="sglang:num_aborted_requests_total",
|
416
683
|
documentation="Number of requests aborted.",
|
417
684
|
labelnames=labels.keys(),
|
418
685
|
)
|
@@ -459,7 +726,10 @@ class TokenizerMetricsCollector:
|
|
459
726
|
100,
|
460
727
|
200,
|
461
728
|
400,
|
462
|
-
|
729
|
+
600,
|
730
|
+
1200,
|
731
|
+
1800,
|
732
|
+
2400,
|
463
733
|
]
|
464
734
|
|
465
735
|
if bucket_inter_token_latency is None:
|
@@ -496,7 +766,7 @@ class TokenizerMetricsCollector:
|
|
496
766
|
buckets=bucket_time_to_first_token,
|
497
767
|
)
|
498
768
|
|
499
|
-
self.
|
769
|
+
self.histogram_inter_token_latency = Histogram(
|
500
770
|
name="sglang:inter_token_latency_seconds",
|
501
771
|
documentation="Histogram of inter-token latency in seconds.",
|
502
772
|
labelnames=labels.keys(),
|
@@ -510,38 +780,53 @@ class TokenizerMetricsCollector:
|
|
510
780
|
buckets=bucket_e2e_request_latency,
|
511
781
|
)
|
512
782
|
|
513
|
-
def _log_histogram(self, histogram, data: Union[int, float]) -> None:
|
514
|
-
histogram.labels(**self.labels).observe(data)
|
515
|
-
|
516
783
|
def observe_one_finished_request(
|
517
784
|
self,
|
785
|
+
labels: Dict[str, str],
|
518
786
|
prompt_tokens: int,
|
519
787
|
generation_tokens: int,
|
520
788
|
cached_tokens: int,
|
521
789
|
e2e_latency: float,
|
522
790
|
has_grammar: bool,
|
523
791
|
):
|
524
|
-
self.prompt_tokens_total.labels(**
|
525
|
-
self.generation_tokens_total.labels(**
|
792
|
+
self.prompt_tokens_total.labels(**labels).inc(prompt_tokens)
|
793
|
+
self.generation_tokens_total.labels(**labels).inc(generation_tokens)
|
526
794
|
if cached_tokens > 0:
|
527
|
-
self.cached_tokens_total.labels(**
|
528
|
-
self.num_requests_total.labels(**
|
795
|
+
self.cached_tokens_total.labels(**labels).inc(cached_tokens)
|
796
|
+
self.num_requests_total.labels(**labels).inc(1)
|
529
797
|
if has_grammar:
|
530
|
-
self.num_so_requests_total.labels(**
|
531
|
-
self.
|
798
|
+
self.num_so_requests_total.labels(**labels).inc(1)
|
799
|
+
self.histogram_e2e_request_latency.labels(**labels).observe(float(e2e_latency))
|
532
800
|
if self.collect_tokens_histogram:
|
533
|
-
self.
|
534
|
-
self.
|
535
|
-
|
536
|
-
|
537
|
-
self.histogram_time_to_first_token.labels(**self.labels).observe(value)
|
801
|
+
self.prompt_tokens_histogram.labels(**labels).observe(float(prompt_tokens))
|
802
|
+
self.generation_tokens_histogram.labels(**labels).observe(
|
803
|
+
float(generation_tokens)
|
804
|
+
)
|
538
805
|
|
539
|
-
def
|
806
|
+
def observe_time_to_first_token(self, labels: Dict[str, str], value: float):
|
807
|
+
self.histogram_time_to_first_token.labels(**labels).observe(value)
|
808
|
+
|
809
|
+
def check_time_to_first_token_straggler(self, value: float) -> bool:
|
810
|
+
his = self.histogram_time_to_first_token.labels(**self.labels)
|
811
|
+
total_observations = sum(bucket._value for bucket in his._buckets)
|
812
|
+
if total_observations < 1000:
|
813
|
+
return False
|
814
|
+
p999_threshold = total_observations * 0.999
|
815
|
+
cumulative_count = 0
|
816
|
+
for i, bucket in enumerate(his._buckets):
|
817
|
+
cumulative_count += bucket._value
|
818
|
+
if cumulative_count > p999_threshold:
|
819
|
+
return value >= his._upper_bounds[i]
|
820
|
+
return False
|
821
|
+
|
822
|
+
def observe_inter_token_latency(
|
823
|
+
self, labels: Dict[str, str], internval: float, num_new_tokens: int
|
824
|
+
):
|
540
825
|
adjusted_interval = internval / num_new_tokens
|
541
826
|
|
542
827
|
# A faster version of the Histogram::observe which observes multiple values at the same time.
|
543
828
|
# reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
|
544
|
-
his = self.
|
829
|
+
his = self.histogram_inter_token_latency.labels(**labels)
|
545
830
|
his._sum.inc(internval)
|
546
831
|
|
547
832
|
for i, bound in enumerate(his._upper_bounds):
|
@@ -549,5 +834,107 @@ class TokenizerMetricsCollector:
|
|
549
834
|
his._buckets[i].inc(num_new_tokens)
|
550
835
|
break
|
551
836
|
|
552
|
-
def observe_one_aborted_request(self):
|
553
|
-
self.num_aborted_requests_total.labels(**
|
837
|
+
def observe_one_aborted_request(self, labels: Dict[str, str]):
|
838
|
+
self.num_aborted_requests_total.labels(**labels).inc(1)
|
839
|
+
|
840
|
+
|
841
|
+
@dataclass
|
842
|
+
class StorageMetrics:
|
843
|
+
prefetch_pgs: List[int] = field(default_factory=list)
|
844
|
+
backup_pgs: List[int] = field(default_factory=list)
|
845
|
+
prefetch_bandwidth: List[float] = field(default_factory=list)
|
846
|
+
backup_bandwidth: List[float] = field(default_factory=list)
|
847
|
+
|
848
|
+
|
849
|
+
class StorageMetricsCollector:
|
850
|
+
def __init__(
|
851
|
+
self,
|
852
|
+
labels: Dict[str, str],
|
853
|
+
):
|
854
|
+
from prometheus_client import Counter, Histogram
|
855
|
+
|
856
|
+
self.labels = labels
|
857
|
+
|
858
|
+
self.prefetched_tokens_total = Counter(
|
859
|
+
name="sglang:prefetched_tokens_total",
|
860
|
+
documentation="Number of prefetched prompt tokens.",
|
861
|
+
labelnames=labels.keys(),
|
862
|
+
)
|
863
|
+
|
864
|
+
self.backuped_tokens_total = Counter(
|
865
|
+
name="sglang:backuped_tokens_total",
|
866
|
+
documentation="Number of backuped tokens.",
|
867
|
+
labelnames=labels.keys(),
|
868
|
+
)
|
869
|
+
|
870
|
+
bucket_io = [
|
871
|
+
1,
|
872
|
+
5,
|
873
|
+
10,
|
874
|
+
50,
|
875
|
+
100,
|
876
|
+
]
|
877
|
+
|
878
|
+
bucket_bandwidth = [
|
879
|
+
0.1,
|
880
|
+
0.5,
|
881
|
+
1,
|
882
|
+
5,
|
883
|
+
10,
|
884
|
+
50,
|
885
|
+
100,
|
886
|
+
]
|
887
|
+
|
888
|
+
self.histogram_prefetch_pgs = Histogram(
|
889
|
+
name="sglang:prefetch_pgs",
|
890
|
+
documentation="Histogram of prefetch pages of batches.",
|
891
|
+
labelnames=labels.keys(),
|
892
|
+
buckets=bucket_io,
|
893
|
+
)
|
894
|
+
|
895
|
+
self.histogram_backup_pgs = Histogram(
|
896
|
+
name="sglang:backup_pgs",
|
897
|
+
documentation="Histogram of backup pages of batches.",
|
898
|
+
labelnames=labels.keys(),
|
899
|
+
buckets=bucket_io,
|
900
|
+
)
|
901
|
+
|
902
|
+
self.histogram_prefetch_bandwidth = Histogram(
|
903
|
+
name="sglang:prefetch_bandwidth",
|
904
|
+
documentation="Histogram of prefetch bandwidth in GB/s.",
|
905
|
+
labelnames=labels.keys(),
|
906
|
+
buckets=bucket_bandwidth,
|
907
|
+
)
|
908
|
+
|
909
|
+
self.histogram_backup_bandwidth = Histogram(
|
910
|
+
name="sglang:backup_bandwidth",
|
911
|
+
documentation="Histogram of backup bandwidth in GB/s.",
|
912
|
+
labelnames=labels.keys(),
|
913
|
+
buckets=bucket_bandwidth,
|
914
|
+
)
|
915
|
+
|
916
|
+
def log_prefetched_tokens(self, prefetched_tokens: int):
|
917
|
+
if prefetched_tokens > 0:
|
918
|
+
self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)
|
919
|
+
|
920
|
+
def log_backuped_tokens(self, backuped_tokens: int):
|
921
|
+
if backuped_tokens > 0:
|
922
|
+
self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)
|
923
|
+
|
924
|
+
def _log_histogram(self, histogram, data: Union[int, float]):
|
925
|
+
histogram.labels(**self.labels).observe(data)
|
926
|
+
|
927
|
+
def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
|
928
|
+
if storage_metrics is None:
|
929
|
+
return
|
930
|
+
|
931
|
+
assert isinstance(storage_metrics, StorageMetrics)
|
932
|
+
|
933
|
+
for v in storage_metrics.prefetch_pgs:
|
934
|
+
self._log_histogram(self.histogram_prefetch_pgs, v)
|
935
|
+
for v in storage_metrics.backup_pgs:
|
936
|
+
self._log_histogram(self.histogram_backup_pgs, v)
|
937
|
+
for v in storage_metrics.prefetch_bandwidth:
|
938
|
+
self._log_histogram(self.histogram_prefetch_bandwidth, v)
|
939
|
+
for v in storage_metrics.backup_bandwidth:
|
940
|
+
self._log_histogram(self.histogram_backup_bandwidth, v)
|