sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -12,13 +12,12 @@
|
|
12
12
|
# limitations under the License.
|
13
13
|
# ==============================================================================
|
14
14
|
"""Utilities for Prometheus Metrics Collection."""
|
15
|
-
|
16
15
|
import time
|
17
|
-
from dataclasses import dataclass
|
18
|
-
from enum import Enum
|
16
|
+
from dataclasses import dataclass, field
|
19
17
|
from typing import Dict, List, Optional, Union
|
20
18
|
|
21
|
-
from sglang.srt.
|
19
|
+
from sglang.srt.disaggregation.utils import DisaggregationMode
|
20
|
+
from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
|
22
21
|
from sglang.srt.server_args import ServerArgs
|
23
22
|
from sglang.srt.utils import get_bool_env_var
|
24
23
|
|
@@ -35,6 +34,7 @@ class TimeStats:
|
|
35
34
|
Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
|
36
35
|
"""
|
37
36
|
|
37
|
+
disagg_mode: DisaggregationMode = DisaggregationMode.NULL
|
38
38
|
lb_entry_time: float = 0.0
|
39
39
|
wait_queue_entry_time: float = 0.0
|
40
40
|
forward_entry_time: float = 0.0
|
@@ -44,17 +44,11 @@ class TimeStats:
|
|
44
44
|
decode_prealloc_queue_entry_time: float = 0.0
|
45
45
|
decode_transfer_queue_entry_time: float = 0.0
|
46
46
|
|
47
|
-
|
48
|
-
|
49
|
-
PREFILL = "prefill"
|
50
|
-
DECODE = "decode"
|
51
|
-
INVALID = "invalid"
|
52
|
-
|
53
|
-
def __str__(self) -> str:
|
54
|
-
# if unified
|
55
|
-
_type = self.get_type()
|
47
|
+
def get_queueing_time(self) -> float:
|
48
|
+
return self.forward_entry_time - self.wait_queue_entry_time
|
56
49
|
|
57
|
-
|
50
|
+
def convert_to_duration(self) -> str:
|
51
|
+
if self.disagg_mode == DisaggregationMode.NULL:
|
58
52
|
queue_duration = self.forward_entry_time - self.wait_queue_entry_time
|
59
53
|
forward_duration = self.completion_time - self.forward_entry_time
|
60
54
|
|
@@ -63,30 +57,28 @@ class TimeStats:
|
|
63
57
|
queue_duration >= 0 and forward_duration >= 0
|
64
58
|
), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
|
65
59
|
|
66
|
-
return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
|
67
|
-
elif
|
60
|
+
return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time:.3f}"
|
61
|
+
elif self.disagg_mode == DisaggregationMode.PREFILL:
|
68
62
|
bootstrap_duration = (
|
69
63
|
self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
|
70
64
|
)
|
71
|
-
|
72
65
|
queue_duration = self.forward_entry_time - self.wait_queue_entry_time
|
73
|
-
|
74
66
|
forward_duration = self.completion_time - self.forward_entry_time
|
75
67
|
|
76
68
|
if SGLANG_TEST_REQUEST_TIME_STATS:
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
69
|
+
if self.wait_queue_entry_time > 0:
|
70
|
+
assert (
|
71
|
+
bootstrap_duration >= 0
|
72
|
+
and queue_duration >= 0
|
73
|
+
and forward_duration >= 0
|
74
|
+
), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
|
75
|
+
|
76
|
+
return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time:.3f}"
|
77
|
+
elif self.disagg_mode == DisaggregationMode.DECODE:
|
85
78
|
prealloc_duration = (
|
86
79
|
self.decode_transfer_queue_entry_time
|
87
80
|
- self.decode_prealloc_queue_entry_time
|
88
81
|
)
|
89
|
-
|
90
82
|
transfer_duration = (
|
91
83
|
self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
|
92
84
|
)
|
@@ -94,67 +86,74 @@ class TimeStats:
|
|
94
86
|
forward_duration = self.completion_time - self.forward_entry_time
|
95
87
|
|
96
88
|
if SGLANG_TEST_REQUEST_TIME_STATS:
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
89
|
+
if self.wait_queue_entry_time > 0:
|
90
|
+
assert (
|
91
|
+
prealloc_duration >= 0
|
92
|
+
and transfer_duration >= 0
|
93
|
+
and queue_duration >= 0
|
94
|
+
and forward_duration >= 0
|
95
|
+
), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0. {self=}"
|
96
|
+
|
97
|
+
return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time:.3f}"
|
105
98
|
else:
|
106
|
-
return "
|
99
|
+
return "Unknown Time Stats"
|
107
100
|
|
108
101
|
def format_duration(self, duration: float) -> str:
|
109
102
|
return f"{duration * 1e3:.2f}ms"
|
110
103
|
|
111
|
-
def
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
):
|
119
|
-
return self.RequestType.UNIFIED
|
120
|
-
elif (
|
121
|
-
self.prefill_bootstrap_queue_entry_time > 0.0
|
122
|
-
and self.prefill_transfer_queue_entry_time > 0.0
|
123
|
-
):
|
124
|
-
return self.RequestType.PREFILL
|
125
|
-
elif (
|
126
|
-
self.decode_prealloc_queue_entry_time > 0.0
|
127
|
-
and self.decode_transfer_queue_entry_time > 0.0
|
128
|
-
and self.wait_queue_entry_time > 0.0
|
129
|
-
):
|
130
|
-
return self.RequestType.DECODE
|
104
|
+
def disagg_mode_str(self) -> str:
|
105
|
+
if self.disagg_mode == DisaggregationMode.NULL:
|
106
|
+
return "unified"
|
107
|
+
elif self.disagg_mode == DisaggregationMode.DECODE:
|
108
|
+
return "decode"
|
109
|
+
elif self.disagg_mode == DisaggregationMode.PREFILL:
|
110
|
+
return "prefill"
|
131
111
|
else:
|
132
|
-
return
|
112
|
+
return "unknown"
|
133
113
|
|
134
114
|
|
135
115
|
@dataclass
|
136
116
|
class SchedulerStats:
|
117
|
+
# Basics
|
137
118
|
num_running_reqs: int = 0
|
138
119
|
num_used_tokens: int = 0
|
139
120
|
token_usage: float = 0.0
|
121
|
+
swa_token_usage: float = 0.0
|
140
122
|
gen_throughput: float = 0.0
|
141
123
|
num_queue_reqs: int = 0
|
142
|
-
cache_hit_rate: float = 0.0
|
143
124
|
num_grammar_queue_reqs: int = 0
|
125
|
+
num_running_reqs_offline_batch: int = 0
|
126
|
+
cache_hit_rate: float = 0.0
|
127
|
+
|
128
|
+
# Speculative decoding
|
144
129
|
spec_accept_length: float = 0.0
|
145
|
-
|
130
|
+
|
131
|
+
# Retract
|
132
|
+
num_retracted_reqs: int = 0
|
133
|
+
num_paused_reqs: int = 0
|
134
|
+
|
135
|
+
# PD disaggregation
|
146
136
|
num_prefill_prealloc_queue_reqs: int = 0
|
147
137
|
num_prefill_inflight_queue_reqs: int = 0
|
148
138
|
num_decode_prealloc_queue_reqs: int = 0
|
149
139
|
num_decode_transfer_queue_reqs: int = 0
|
150
|
-
|
140
|
+
kv_transfer_speed_gb_s: float = 0.0
|
141
|
+
kv_transfer_latency_ms: float = 0.0
|
142
|
+
|
143
|
+
# Utilization
|
144
|
+
utilization: float = 0.0
|
145
|
+
max_running_requests_under_SLO: Optional[int] = None
|
146
|
+
|
147
|
+
# Engine startup
|
148
|
+
engine_startup_time: float = 0.0
|
149
|
+
engine_load_weights_time: float = 0.0
|
151
150
|
|
152
151
|
|
153
152
|
class SchedulerMetricsCollector:
|
154
153
|
|
155
154
|
def __init__(self, labels: Dict[str, str]) -> None:
|
156
155
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
157
|
-
from prometheus_client import Counter, Gauge
|
156
|
+
from prometheus_client import Counter, Gauge, Histogram
|
158
157
|
|
159
158
|
self.labels = labels
|
160
159
|
self.last_log_time = time.perf_counter()
|
@@ -165,42 +164,48 @@ class SchedulerMetricsCollector:
|
|
165
164
|
labelnames=labels.keys(),
|
166
165
|
multiprocess_mode="mostrecent",
|
167
166
|
)
|
168
|
-
|
169
167
|
self.num_used_tokens = Gauge(
|
170
168
|
name="sglang:num_used_tokens",
|
171
169
|
documentation="The number of used tokens.",
|
172
170
|
labelnames=labels.keys(),
|
173
171
|
multiprocess_mode="mostrecent",
|
174
172
|
)
|
175
|
-
|
176
173
|
self.token_usage = Gauge(
|
177
174
|
name="sglang:token_usage",
|
178
175
|
documentation="The token usage.",
|
179
176
|
labelnames=labels.keys(),
|
180
177
|
multiprocess_mode="mostrecent",
|
181
178
|
)
|
182
|
-
|
179
|
+
self.swa_token_usage = Gauge(
|
180
|
+
name="sglang:swa_token_usage",
|
181
|
+
documentation="The token usage for SWA layers.",
|
182
|
+
labelnames=labels.keys(),
|
183
|
+
multiprocess_mode="mostrecent",
|
184
|
+
)
|
183
185
|
self.gen_throughput = Gauge(
|
184
186
|
name="sglang:gen_throughput",
|
185
187
|
documentation="The generation throughput (token/s).",
|
186
188
|
labelnames=labels.keys(),
|
187
189
|
multiprocess_mode="mostrecent",
|
188
190
|
)
|
189
|
-
|
190
191
|
self.num_queue_reqs = Gauge(
|
191
192
|
name="sglang:num_queue_reqs",
|
192
193
|
documentation="The number of requests in the waiting queue.",
|
193
194
|
labelnames=labels.keys(),
|
194
195
|
multiprocess_mode="mostrecent",
|
195
196
|
)
|
196
|
-
|
197
197
|
self.num_grammar_queue_reqs = Gauge(
|
198
198
|
name="sglang:num_grammar_queue_reqs",
|
199
199
|
documentation="The number of requests in the grammar waiting queue.",
|
200
200
|
labelnames=labels.keys(),
|
201
201
|
multiprocess_mode="mostrecent",
|
202
202
|
)
|
203
|
-
|
203
|
+
self.num_running_reqs_offline_batch = Gauge(
|
204
|
+
name="sglang:num_running_reqs_offline_batch",
|
205
|
+
documentation="The number of running low-priority offline batch requests(label is 'batch').",
|
206
|
+
labelnames=labels.keys(),
|
207
|
+
multiprocess_mode="mostrecent",
|
208
|
+
)
|
204
209
|
self.cache_hit_rate = Gauge(
|
205
210
|
name="sglang:cache_hit_rate",
|
206
211
|
documentation="The prefix cache hit rate.",
|
@@ -208,6 +213,7 @@ class SchedulerMetricsCollector:
|
|
208
213
|
multiprocess_mode="mostrecent",
|
209
214
|
)
|
210
215
|
|
216
|
+
# Speculative decoding
|
211
217
|
self.spec_accept_length = Gauge(
|
212
218
|
name="sglang:spec_accept_length",
|
213
219
|
documentation="The average acceptance length of speculative decoding.",
|
@@ -215,83 +221,307 @@ class SchedulerMetricsCollector:
|
|
215
221
|
multiprocess_mode="mostrecent",
|
216
222
|
)
|
217
223
|
|
218
|
-
|
219
|
-
|
220
|
-
|
224
|
+
# Retract
|
225
|
+
self.num_retracted_reqs = Gauge(
|
226
|
+
name="sglang:num_retracted_reqs",
|
227
|
+
documentation="The number of retracted requests.",
|
221
228
|
labelnames=labels.keys(),
|
222
|
-
multiprocess_mode="mostrecent",
|
223
229
|
)
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
documentation="The total number of retracted requests due to kvcache full.",
|
230
|
+
self.num_paused_reqs = Gauge(
|
231
|
+
name="sglang:num_paused_reqs",
|
232
|
+
documentation="The number of paused requests by async weight sync.",
|
228
233
|
labelnames=labels.keys(),
|
229
|
-
multiprocess_mode="mostrecent",
|
230
234
|
)
|
231
235
|
|
232
|
-
#
|
236
|
+
# PD disaggregation
|
233
237
|
self.num_prefill_prealloc_queue_reqs = Gauge(
|
234
238
|
name="sglang:num_prefill_prealloc_queue_reqs",
|
235
239
|
documentation="The number of requests in the prefill prealloc queue.",
|
236
240
|
labelnames=labels.keys(),
|
237
241
|
multiprocess_mode="mostrecent",
|
238
242
|
)
|
239
|
-
|
240
243
|
self.num_prefill_inflight_queue_reqs = Gauge(
|
241
244
|
name="sglang:num_prefill_inflight_queue_reqs",
|
242
245
|
documentation="The number of requests in the prefill inflight queue.",
|
243
246
|
labelnames=labels.keys(),
|
244
247
|
multiprocess_mode="mostrecent",
|
245
248
|
)
|
246
|
-
|
247
249
|
self.num_decode_prealloc_queue_reqs = Gauge(
|
248
250
|
name="sglang:num_decode_prealloc_queue_reqs",
|
249
251
|
documentation="The number of requests in the decode prealloc queue.",
|
250
252
|
labelnames=labels.keys(),
|
251
253
|
multiprocess_mode="mostrecent",
|
252
254
|
)
|
253
|
-
|
254
255
|
self.num_decode_transfer_queue_reqs = Gauge(
|
255
256
|
name="sglang:num_decode_transfer_queue_reqs",
|
256
257
|
documentation="The number of requests in the decode transfer queue.",
|
257
258
|
labelnames=labels.keys(),
|
258
259
|
multiprocess_mode="mostrecent",
|
259
260
|
)
|
260
|
-
|
261
261
|
self.num_bootstrap_failed_reqs = Counter(
|
262
|
-
name="sglang:
|
262
|
+
name="sglang:num_bootstrap_failed_reqs_total",
|
263
263
|
documentation="The number of bootstrap failed requests.",
|
264
264
|
labelnames=labels.keys(),
|
265
265
|
)
|
266
|
-
|
267
266
|
self.num_transfer_failed_reqs = Counter(
|
268
|
-
name="sglang:
|
267
|
+
name="sglang:num_transfer_failed_reqs_total",
|
269
268
|
documentation="The number of transfer failed requests.",
|
270
269
|
labelnames=labels.keys(),
|
271
270
|
)
|
271
|
+
self.kv_transfer_speed_gb_s = Gauge(
|
272
|
+
name="sglang:kv_transfer_speed_gb_s",
|
273
|
+
documentation="The transfer speed of the KV cache in GB/s.",
|
274
|
+
labelnames=labels.keys(),
|
275
|
+
multiprocess_mode="mostrecent",
|
276
|
+
)
|
277
|
+
self.kv_transfer_latency_ms = Gauge(
|
278
|
+
name="sglang:kv_transfer_latency_ms",
|
279
|
+
documentation="The transfer latency of the KV cache in ms.",
|
280
|
+
labelnames=labels.keys(),
|
281
|
+
multiprocess_mode="mostrecent",
|
282
|
+
)
|
283
|
+
|
284
|
+
# Utilization
|
285
|
+
self.utilization = Gauge(
|
286
|
+
name="sglang:utilization",
|
287
|
+
documentation="The utilization.",
|
288
|
+
labelnames=labels.keys(),
|
289
|
+
multiprocess_mode="mostrecent",
|
290
|
+
)
|
291
|
+
self.max_running_requests_under_SLO = Gauge(
|
292
|
+
name="sglang:max_running_requests_under_SLO",
|
293
|
+
documentation="The maximum number of running requests under SLO.",
|
294
|
+
labelnames=labels.keys(),
|
295
|
+
multiprocess_mode="mostrecent",
|
296
|
+
)
|
297
|
+
|
298
|
+
# Engine startup
|
299
|
+
self.engine_startup_time = Gauge(
|
300
|
+
name="sglang:engine_startup_time",
|
301
|
+
documentation="The time taken for the engine to start up.",
|
302
|
+
labelnames=labels.keys(),
|
303
|
+
multiprocess_mode="mostrecent",
|
304
|
+
)
|
305
|
+
self.engine_load_weights_time = Gauge(
|
306
|
+
name="sglang:engine_load_weights_time",
|
307
|
+
documentation="The time taken for the engine to load weights.",
|
308
|
+
labelnames=labels.keys(),
|
309
|
+
multiprocess_mode="mostrecent",
|
310
|
+
)
|
311
|
+
|
312
|
+
# Additional queueing time histogram
|
313
|
+
self.queue_time = Histogram(
|
314
|
+
name="sglang:queue_time_seconds",
|
315
|
+
documentation="Histogram of queueing time in seconds.",
|
316
|
+
labelnames=labels.keys(),
|
317
|
+
buckets=[
|
318
|
+
0.0,
|
319
|
+
0.1,
|
320
|
+
0.2,
|
321
|
+
0.5,
|
322
|
+
1,
|
323
|
+
2,
|
324
|
+
3,
|
325
|
+
4,
|
326
|
+
5,
|
327
|
+
10,
|
328
|
+
15,
|
329
|
+
20,
|
330
|
+
30,
|
331
|
+
40,
|
332
|
+
50,
|
333
|
+
60,
|
334
|
+
70,
|
335
|
+
80,
|
336
|
+
90,
|
337
|
+
100,
|
338
|
+
200,
|
339
|
+
300,
|
340
|
+
400,
|
341
|
+
500,
|
342
|
+
600,
|
343
|
+
700,
|
344
|
+
800,
|
345
|
+
900,
|
346
|
+
1000,
|
347
|
+
1200,
|
348
|
+
1400,
|
349
|
+
1600,
|
350
|
+
1800,
|
351
|
+
2000,
|
352
|
+
2500,
|
353
|
+
3000,
|
354
|
+
],
|
355
|
+
)
|
356
|
+
|
357
|
+
# Grammar metrics
|
358
|
+
self.grammar_compilation_time = Histogram(
|
359
|
+
name="sglang:grammar_compilation_time_seconds",
|
360
|
+
documentation="Histogram of grammar compilation time in seconds.",
|
361
|
+
labelnames=labels.keys(),
|
362
|
+
buckets=[
|
363
|
+
0.0,
|
364
|
+
0.01,
|
365
|
+
0.02,
|
366
|
+
0.05,
|
367
|
+
0.1,
|
368
|
+
0.2,
|
369
|
+
0.5,
|
370
|
+
1,
|
371
|
+
2,
|
372
|
+
5,
|
373
|
+
10,
|
374
|
+
20,
|
375
|
+
30,
|
376
|
+
60,
|
377
|
+
90,
|
378
|
+
120,
|
379
|
+
240,
|
380
|
+
],
|
381
|
+
)
|
382
|
+
self.num_grammar_cache_hit = Counter(
|
383
|
+
name="sglang:num_grammar_cache_hit_total",
|
384
|
+
documentation="Number of grammar cache hits.",
|
385
|
+
labelnames=labels.keys(),
|
386
|
+
)
|
387
|
+
self.num_grammar_aborted = Counter(
|
388
|
+
name="sglang:num_grammar_aborted_total",
|
389
|
+
documentation="Number of grammar aborted requests.",
|
390
|
+
labelnames=labels.keys(),
|
391
|
+
)
|
392
|
+
self.num_grammar_total = Counter(
|
393
|
+
name="sglang:num_grammar_total",
|
394
|
+
documentation="Number of the total grammar requests.",
|
395
|
+
labelnames=labels.keys(),
|
396
|
+
)
|
397
|
+
self.grammar_schema_count = Histogram(
|
398
|
+
name="sglang:grammar_schema_count",
|
399
|
+
documentation="Histogram of grammar schema count.",
|
400
|
+
labelnames=labels.keys(),
|
401
|
+
buckets=[
|
402
|
+
0,
|
403
|
+
1,
|
404
|
+
2,
|
405
|
+
5,
|
406
|
+
10,
|
407
|
+
20,
|
408
|
+
30,
|
409
|
+
40,
|
410
|
+
60,
|
411
|
+
80,
|
412
|
+
100,
|
413
|
+
120,
|
414
|
+
140,
|
415
|
+
160,
|
416
|
+
180,
|
417
|
+
200,
|
418
|
+
300,
|
419
|
+
400,
|
420
|
+
500,
|
421
|
+
700,
|
422
|
+
1000,
|
423
|
+
],
|
424
|
+
)
|
425
|
+
self.grammar_ebnf_size = Histogram(
|
426
|
+
name="sglang:grammar_ebnf_size",
|
427
|
+
documentation="Histogram of grammar EBNF size.",
|
428
|
+
labelnames=labels.keys(),
|
429
|
+
buckets=[
|
430
|
+
0,
|
431
|
+
50,
|
432
|
+
100,
|
433
|
+
200,
|
434
|
+
300,
|
435
|
+
500,
|
436
|
+
1000,
|
437
|
+
2000,
|
438
|
+
3000,
|
439
|
+
5000,
|
440
|
+
10000,
|
441
|
+
20000,
|
442
|
+
30000,
|
443
|
+
50000,
|
444
|
+
100000,
|
445
|
+
],
|
446
|
+
)
|
447
|
+
|
448
|
+
tree_traversal_time_buckets = [
|
449
|
+
0.0,
|
450
|
+
0.01,
|
451
|
+
0.02,
|
452
|
+
0.05,
|
453
|
+
0.1,
|
454
|
+
0.2,
|
455
|
+
0.5,
|
456
|
+
1,
|
457
|
+
2,
|
458
|
+
5,
|
459
|
+
10,
|
460
|
+
15,
|
461
|
+
30,
|
462
|
+
60,
|
463
|
+
90,
|
464
|
+
120,
|
465
|
+
240,
|
466
|
+
]
|
467
|
+
self.grammar_tree_traversal_time_avg = Histogram(
|
468
|
+
name="sglang:grammar_tree_traversal_time_avg",
|
469
|
+
documentation="Histogram of average grammar tree traversal time in seconds.",
|
470
|
+
labelnames=labels.keys(),
|
471
|
+
buckets=tree_traversal_time_buckets,
|
472
|
+
)
|
473
|
+
self.grammar_tree_traversal_time_max = Histogram(
|
474
|
+
name="sglang:grammar_tree_traversal_time_max",
|
475
|
+
documentation="Histogram of max grammar tree traversal time in seconds.",
|
476
|
+
labelnames=labels.keys(),
|
477
|
+
buckets=tree_traversal_time_buckets,
|
478
|
+
)
|
479
|
+
|
480
|
+
self.per_stage_req_latency_seconds = Histogram(
|
481
|
+
name="sglang:per_stage_req_latency_seconds",
|
482
|
+
documentation="The latency of each stage of requests.",
|
483
|
+
# captures latency in range [1ms - ~1191s]
|
484
|
+
buckets=exponential_buckets(start=0.001, width=1.62, length=30),
|
485
|
+
labelnames=list(labels.keys()) + ["stage"],
|
486
|
+
)
|
272
487
|
|
273
488
|
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
    """Write ``data`` to ``gauge`` under this collector's labels.

    Args:
        gauge: Metric whose ``labels(**kwargs)`` call returns an object
            providing ``set``.
        data: Value passed to ``set``.
    """
    labelled_gauge = gauge.labels(**self.labels)
    labelled_gauge.set(data)
|
276
491
|
|
492
|
+
def _log_histogram(self, histogram, data: Union[int, float]) -> None:
    """Record ``data`` in ``histogram`` under this collector's labels.

    Args:
        histogram: Metric whose ``labels(**kwargs)`` call returns an object
            providing ``observe``.
        data: Sample passed to ``observe``.
    """
    labelled_histogram = histogram.labels(**self.labels)
    labelled_histogram.observe(data)
|
494
|
+
|
277
495
|
def increment_bootstrap_failed_reqs(self) -> None:
    """Add one to the bootstrap-failed request counter for this collector's labels."""
    failed_counter = self.num_bootstrap_failed_reqs.labels(**self.labels)
    failed_counter.inc(1)
|
279
497
|
|
280
498
|
def increment_transfer_failed_reqs(self) -> None:
    """Add one to the transfer-failed request counter for this collector's labels."""
    failed_counter = self.num_transfer_failed_reqs.labels(**self.labels)
    failed_counter.inc(1)
|
282
500
|
|
501
|
+
def observe_per_stage_req_latency(self, stage: str, latency: float) -> None:
    """Record ``latency`` in the per-stage latency histogram.

    The sample is labelled with this collector's labels plus a ``"stage"``
    entry set to ``stage``.

    Args:
        stage: Value used for the ``"stage"`` label.
        latency: Sample passed to ``observe``.
    """
    # Copy first so the collector's own label mapping is never mutated.
    stage_labels = dict(self.labels)
    stage_labels["stage"] = stage
    stage_histogram = self.per_stage_req_latency_seconds.labels(**stage_labels)
    stage_histogram.observe(latency)
|
504
|
+
|
505
|
+
def observe_queue_time(self, latency: float) -> None:
    """Record ``latency`` in the ``queue_time`` histogram via ``_log_histogram``."""
    queue_histogram = self.queue_time
    self._log_histogram(queue_histogram, latency)
|
507
|
+
|
283
508
|
def log_stats(self, stats: SchedulerStats) -> None:
|
284
509
|
self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
|
285
510
|
self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
|
286
511
|
self._log_gauge(self.token_usage, stats.token_usage)
|
512
|
+
self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
|
287
513
|
self._log_gauge(self.gen_throughput, stats.gen_throughput)
|
288
514
|
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
|
289
515
|
self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
|
516
|
+
self._log_gauge(
|
517
|
+
self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
|
518
|
+
)
|
290
519
|
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
520
|
+
|
521
|
+
# Speculative decoding
|
291
522
|
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
292
|
-
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
293
523
|
|
294
|
-
#
|
524
|
+
# PD disaggregation
|
295
525
|
self._log_gauge(
|
296
526
|
self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
|
297
527
|
)
|
@@ -304,15 +534,58 @@ class SchedulerMetricsCollector:
|
|
304
534
|
self._log_gauge(
|
305
535
|
self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
|
306
536
|
)
|
537
|
+
self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
|
538
|
+
self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
|
539
|
+
|
540
|
+
# Retract
|
541
|
+
self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
|
542
|
+
self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
|
543
|
+
|
544
|
+
# Utilization
|
545
|
+
self._log_gauge(self.utilization, stats.utilization)
|
546
|
+
if stats.max_running_requests_under_SLO is not None:
|
547
|
+
self._log_gauge(
|
548
|
+
self.max_running_requests_under_SLO,
|
549
|
+
stats.max_running_requests_under_SLO,
|
550
|
+
)
|
551
|
+
|
552
|
+
# Engine startup time
|
553
|
+
self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
|
554
|
+
if stats.engine_load_weights_time is not None:
|
555
|
+
self._log_gauge(
|
556
|
+
self.engine_load_weights_time, stats.engine_load_weights_time
|
557
|
+
)
|
307
558
|
|
308
559
|
self.last_log_time = time.perf_counter()
|
309
560
|
|
561
|
+
def log_grammar_stats(self, grammar_stats) -> None:
    """Export one grammar-stats record to the grammar metrics.

    ``grammar_stats`` is duck-typed (to avoid a cross-package dependency):
    every attribute is read with ``getattr`` and skipped when missing.

    * ``compilation_time``, ``schema_count`` and ``ebnf_size`` are observed in
      their histograms when they are not ``None``.
    * A non-empty ``tree_traversal_time`` contributes its maximum and its
      arithmetic mean to the two traversal-time histograms.
    * Truthy ``is_cache_hit`` / ``is_grammar_aborted`` flags bump their
      counters; the total counter is bumped on every call.
    """
    optional_samples = (
        ("compilation_time", self.grammar_compilation_time),
        ("schema_count", self.grammar_schema_count),
        ("ebnf_size", self.grammar_ebnf_size),
    )
    for attr_name, histogram in optional_samples:
        sample = getattr(grammar_stats, attr_name, None)
        if sample is not None:
            self._log_histogram(histogram, sample)

    traversal_times = getattr(grammar_stats, "tree_traversal_time", None)
    if traversal_times:
        slowest = max(traversal_times)
        mean = sum(traversal_times) / len(traversal_times)
        self._log_histogram(self.grammar_tree_traversal_time_max, slowest)
        self._log_histogram(self.grammar_tree_traversal_time_avg, mean)

    cache_hit = getattr(grammar_stats, "is_cache_hit", False)
    if cache_hit:
        self.num_grammar_cache_hit.labels(**self.labels).inc(1)

    aborted = getattr(grammar_stats, "is_grammar_aborted", False)
    if aborted:
        self.num_grammar_aborted.labels(**self.labels).inc(1)

    # Every record counts towards the total, regardless of the flags above.
    self.num_grammar_total.labels(**self.labels).inc(1)
|
582
|
+
|
310
583
|
|
311
584
|
class TokenizerMetricsCollector:
|
312
585
|
def __init__(
|
313
586
|
self,
|
314
|
-
server_args: ServerArgs,
|
315
|
-
labels: Dict[str, str],
|
587
|
+
server_args: Optional[ServerArgs] = None,
|
588
|
+
labels: Dict[str, str] = None,
|
316
589
|
bucket_time_to_first_token: Optional[List[float]] = None,
|
317
590
|
bucket_inter_token_latency: Optional[List[float]] = None,
|
318
591
|
bucket_e2e_request_latency: Optional[List[float]] = None,
|
@@ -321,7 +594,7 @@ class TokenizerMetricsCollector:
|
|
321
594
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
322
595
|
from prometheus_client import Counter, Histogram
|
323
596
|
|
324
|
-
self.labels = labels
|
597
|
+
self.labels = labels or {}
|
325
598
|
self.collect_tokens_histogram = collect_tokens_histogram
|
326
599
|
|
327
600
|
self.prompt_tokens_total = Counter(
|
@@ -361,6 +634,13 @@ class TokenizerMetricsCollector:
|
|
361
634
|
30000,
|
362
635
|
35000,
|
363
636
|
40000,
|
637
|
+
66000,
|
638
|
+
99000,
|
639
|
+
132000,
|
640
|
+
300000,
|
641
|
+
600000,
|
642
|
+
900000,
|
643
|
+
1100000,
|
364
644
|
]
|
365
645
|
self.prompt_tokens_histogram = Histogram(
|
366
646
|
name="sglang:prompt_tokens_histogram",
|
@@ -370,34 +650,13 @@ class TokenizerMetricsCollector:
|
|
370
650
|
server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
|
371
651
|
),
|
372
652
|
)
|
373
|
-
default_bucket_generation_tokens = [
|
374
|
-
100,
|
375
|
-
300,
|
376
|
-
500,
|
377
|
-
1000,
|
378
|
-
1200,
|
379
|
-
1500,
|
380
|
-
1700,
|
381
|
-
2000,
|
382
|
-
2500,
|
383
|
-
3000,
|
384
|
-
3500,
|
385
|
-
4000,
|
386
|
-
4500,
|
387
|
-
5000,
|
388
|
-
6000,
|
389
|
-
7000,
|
390
|
-
8000,
|
391
|
-
9000,
|
392
|
-
10000,
|
393
|
-
]
|
394
653
|
self.generation_tokens_histogram = Histogram(
|
395
654
|
name="sglang:generation_tokens_histogram",
|
396
655
|
documentation="Histogram of generation token length.",
|
397
656
|
labelnames=labels.keys(),
|
398
657
|
buckets=generate_buckets(
|
399
658
|
server_args.generation_tokens_buckets,
|
400
|
-
|
659
|
+
default_bucket_prompt_tokens,
|
401
660
|
),
|
402
661
|
)
|
403
662
|
|
@@ -420,7 +679,7 @@ class TokenizerMetricsCollector:
|
|
420
679
|
)
|
421
680
|
|
422
681
|
self.num_aborted_requests_total = Counter(
|
423
|
-
name="sglang:
|
682
|
+
name="sglang:num_aborted_requests_total",
|
424
683
|
documentation="Number of requests aborted.",
|
425
684
|
labelnames=labels.keys(),
|
426
685
|
)
|
@@ -467,7 +726,10 @@ class TokenizerMetricsCollector:
|
|
467
726
|
100,
|
468
727
|
200,
|
469
728
|
400,
|
470
|
-
|
729
|
+
600,
|
730
|
+
1200,
|
731
|
+
1800,
|
732
|
+
2400,
|
471
733
|
]
|
472
734
|
|
473
735
|
if bucket_inter_token_latency is None:
|
@@ -504,7 +766,7 @@ class TokenizerMetricsCollector:
|
|
504
766
|
buckets=bucket_time_to_first_token,
|
505
767
|
)
|
506
768
|
|
507
|
-
self.
|
769
|
+
self.histogram_inter_token_latency = Histogram(
|
508
770
|
name="sglang:inter_token_latency_seconds",
|
509
771
|
documentation="Histogram of inter-token latency in seconds.",
|
510
772
|
labelnames=labels.keys(),
|
@@ -518,38 +780,53 @@ class TokenizerMetricsCollector:
|
|
518
780
|
buckets=bucket_e2e_request_latency,
|
519
781
|
)
|
520
782
|
|
521
|
-
def _log_histogram(self, histogram, data: Union[int, float]) -> None:
|
522
|
-
histogram.labels(**self.labels).observe(data)
|
523
|
-
|
524
783
|
def observe_one_finished_request(
    self,
    labels: Dict[str, str],
    prompt_tokens: int,
    generation_tokens: int,
    cached_tokens: int,
    e2e_latency: float,
    has_grammar: bool,
):
    """Record the metrics of one finished request under ``labels``.

    Args:
        labels: Label values applied to every metric touched here.
        prompt_tokens: Added to the prompt-token counter.
        generation_tokens: Added to the generation-token counter.
        cached_tokens: Added to the cached-token counter only when positive.
        e2e_latency: Observed (as ``float``) in the end-to-end latency histogram.
        has_grammar: When true, the ``num_so_requests_total`` counter is
            incremented as well.
    """
    # Collect the counter updates first, preserving the original update order.
    counter_updates = [
        (self.prompt_tokens_total, prompt_tokens),
        (self.generation_tokens_total, generation_tokens),
    ]
    if cached_tokens > 0:
        counter_updates.append((self.cached_tokens_total, cached_tokens))
    counter_updates.append((self.num_requests_total, 1))
    if has_grammar:
        counter_updates.append((self.num_so_requests_total, 1))

    for counter, amount in counter_updates:
        counter.labels(**labels).inc(amount)

    self.histogram_e2e_request_latency.labels(**labels).observe(float(e2e_latency))

    # Token-length histograms are optional.
    if not self.collect_tokens_histogram:
        return
    self.prompt_tokens_histogram.labels(**labels).observe(float(prompt_tokens))
    self.generation_tokens_histogram.labels(**labels).observe(
        float(generation_tokens)
    )
|
546
805
|
|
547
|
-
def
|
806
|
+
def observe_time_to_first_token(self, labels: Dict[str, str], value: float):
    """Record ``value`` in the time-to-first-token histogram under ``labels``."""
    ttft_histogram = self.histogram_time_to_first_token.labels(**labels)
    ttft_histogram.observe(value)
|
808
|
+
|
809
|
+
def check_time_to_first_token_straggler(self, value: float) -> bool:
    """Report whether ``value`` reaches the observed p99.9 TTFT bucket.

    Reads the time-to-first-token histogram for this collector's own labels.
    Each entry of ``_buckets`` is treated as a per-bucket count: the counts
    are accumulated in order until the running total exceeds 99.9% of all
    observations, and ``value`` is compared with that bucket's upper bound.

    Args:
        value: Time-to-first-token measurement to classify.

    Returns:
        ``True`` when ``value`` is greater than or equal to the upper bound of
        the bucket described above. ``False`` when fewer than 1000
        observations exist or no bucket crosses the threshold.
    """
    child = self.histogram_time_to_first_token.labels(**self.labels)
    buckets = child._buckets

    observed = sum(entry._value for entry in buckets)
    if observed < 1000:
        # Too few samples for the tail estimate.
        return False

    cutoff = observed * 0.999
    running = 0
    for upper_bound, entry in zip(child._upper_bounds, buckets):
        running += entry._value
        if running > cutoff:
            return value >= upper_bound
    return False
|
821
|
+
|
822
|
+
def observe_inter_token_latency(
|
823
|
+
self, labels: Dict[str, str], internval: float, num_new_tokens: int
|
824
|
+
):
|
548
825
|
adjusted_interval = internval / num_new_tokens
|
549
826
|
|
550
827
|
# A faster version of the Histogram::observe which observes multiple values at the same time.
|
551
828
|
# reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
|
552
|
-
his = self.
|
829
|
+
his = self.histogram_inter_token_latency.labels(**labels)
|
553
830
|
his._sum.inc(internval)
|
554
831
|
|
555
832
|
for i, bound in enumerate(his._upper_bounds):
|
@@ -557,5 +834,107 @@ class TokenizerMetricsCollector:
|
|
557
834
|
his._buckets[i].inc(num_new_tokens)
|
558
835
|
break
|
559
836
|
|
560
|
-
def observe_one_aborted_request(self):
|
561
|
-
self.num_aborted_requests_total.labels(**
|
837
|
+
def observe_one_aborted_request(self, labels: Dict[str, str]):
    """Add one to the aborted-request counter under ``labels``."""
    aborted_counter = self.num_aborted_requests_total.labels(**labels)
    aborted_counter.inc(1)
|
839
|
+
|
840
|
+
|
841
|
+
@dataclass
class StorageMetrics:
    """Container for storage prefetch/backup samples.

    Each field is a list that defaults to a fresh empty list per instance.
    """

    # Page-count samples for prefetch operations.
    prefetch_pgs: List[int] = field(default_factory=list)
    # Page-count samples for backup operations.
    backup_pgs: List[int] = field(default_factory=list)
    # Bandwidth samples for prefetch operations.
    prefetch_bandwidth: List[float] = field(default_factory=list)
    # Bandwidth samples for backup operations.
    backup_bandwidth: List[float] = field(default_factory=list)
|
847
|
+
|
848
|
+
|
849
|
+
class StorageMetricsCollector:
    """Publishes storage prefetch/backup metrics through ``prometheus_client``.

    Two counters accumulate prefetched and backed-up token totals, and four
    histograms record page counts and bandwidths. Every metric is declared
    with the keys of ``labels`` as its label names and is always updated with
    that same ``labels`` mapping.
    """

    def __init__(
        self,
        labels: Dict[str, str],
    ):
        """Create the counters and histograms.

        Args:
            labels: Label names mapped to the values used on every update.
        """
        # NOTE(review): imported inside the constructor, presumably for the
        # same PROMETHEUS_MULTIPROC_DIR ordering reason noted elsewhere in
        # this file - confirm.
        from prometheus_client import Counter, Histogram

        self.labels = labels

        self.prefetched_tokens_total = Counter(
            name="sglang:prefetched_tokens_total",
            documentation="Number of prefetched prompt tokens.",
            labelnames=labels.keys(),
        )
        self.backuped_tokens_total = Counter(
            name="sglang:backuped_tokens_total",
            documentation="Number of backuped tokens.",
            labelnames=labels.keys(),
        )

        # Bucket upper bounds shared by the two page-count histograms.
        page_buckets = [1, 5, 10, 50, 100]
        # Bucket upper bounds shared by the two bandwidth histograms.
        bandwidth_buckets = [0.1, 0.5, 1, 5, 10, 50, 100]

        self.histogram_prefetch_pgs = Histogram(
            name="sglang:prefetch_pgs",
            documentation="Histogram of prefetch pages of batches.",
            labelnames=labels.keys(),
            buckets=page_buckets,
        )
        self.histogram_backup_pgs = Histogram(
            name="sglang:backup_pgs",
            documentation="Histogram of backup pages of batches.",
            labelnames=labels.keys(),
            buckets=page_buckets,
        )
        self.histogram_prefetch_bandwidth = Histogram(
            name="sglang:prefetch_bandwidth",
            documentation="Histogram of prefetch bandwidth in GB/s.",
            labelnames=labels.keys(),
            buckets=bandwidth_buckets,
        )
        self.histogram_backup_bandwidth = Histogram(
            name="sglang:backup_bandwidth",
            documentation="Histogram of backup bandwidth in GB/s.",
            labelnames=labels.keys(),
            buckets=bandwidth_buckets,
        )

    def log_prefetched_tokens(self, prefetched_tokens: int):
        """Add ``prefetched_tokens`` to the prefetched counter when it is positive."""
        if prefetched_tokens <= 0:
            return
        self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)

    def log_backuped_tokens(self, backuped_tokens: int):
        """Add ``backuped_tokens`` to the backup counter when it is positive."""
        if backuped_tokens <= 0:
            return
        self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)

    def _log_histogram(self, histogram, data: Union[int, float]):
        """Observe ``data`` in ``histogram`` under this collector's labels."""
        labelled_histogram = histogram.labels(**self.labels)
        labelled_histogram.observe(data)

    def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
        """Observe every sample held by ``storage_metrics``.

        Args:
            storage_metrics: Samples to export; ``None`` is a no-op. Any other
                value must be a ``StorageMetrics`` instance (checked with
                ``assert``).
        """
        if storage_metrics is None:
            return

        assert isinstance(storage_metrics, StorageMetrics)

        # Pair each histogram with its sample list, in the original order.
        sample_series = (
            (self.histogram_prefetch_pgs, storage_metrics.prefetch_pgs),
            (self.histogram_backup_pgs, storage_metrics.backup_pgs),
            (self.histogram_prefetch_bandwidth, storage_metrics.prefetch_bandwidth),
            (self.histogram_backup_bandwidth, storage_metrics.backup_bandwidth),
        )
        for histogram, samples in sample_series:
            for sample in samples:
                self._log_histogram(histogram, sample)
|