sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +67 -43
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +88 -53
- sglang/srt/entrypoints/openai/protocol.py +7 -4
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +39 -19
- sglang/srt/entrypoints/openai/serving_completions.py +15 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -7
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +182 -49
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +68 -41
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +0 -18
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +200 -199
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +191 -139
- sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +260 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +18 -33
- sglang/srt/mem_cache/hiradix_cache.py +108 -48
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +121 -57
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +502 -77
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +75 -19
- sglang/srt/model_executor/model_runner.py +357 -30
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +346 -48
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +11 -2
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +60 -13
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +40 -9
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +355 -37
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +197 -112
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +46 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +12 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -12,12 +12,13 @@
|
|
12
12
|
# limitations under the License.
|
13
13
|
# ==============================================================================
|
14
14
|
"""Utilities for Prometheus Metrics Collection."""
|
15
|
-
|
16
15
|
import time
|
17
|
-
from dataclasses import dataclass
|
16
|
+
from dataclasses import dataclass, field
|
18
17
|
from enum import Enum
|
19
18
|
from typing import Dict, List, Optional, Union
|
20
19
|
|
20
|
+
from sglang.srt.metrics.utils import generate_buckets
|
21
|
+
from sglang.srt.server_args import ServerArgs
|
21
22
|
from sglang.srt.utils import get_bool_env_var
|
22
23
|
|
23
24
|
SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
|
@@ -48,6 +49,9 @@ class TimeStats:
|
|
48
49
|
DECODE = "decode"
|
49
50
|
INVALID = "invalid"
|
50
51
|
|
52
|
+
def get_queueing_time(self) -> float:
|
53
|
+
return self.forward_entry_time - self.wait_queue_entry_time
|
54
|
+
|
51
55
|
def __str__(self) -> str:
|
52
56
|
# if unified
|
53
57
|
_type = self.get_type()
|
@@ -132,27 +136,48 @@ class TimeStats:
|
|
132
136
|
|
133
137
|
@dataclass
|
134
138
|
class SchedulerStats:
|
139
|
+
# Basics
|
135
140
|
num_running_reqs: int = 0
|
136
141
|
num_used_tokens: int = 0
|
137
142
|
token_usage: float = 0.0
|
143
|
+
swa_token_usage: float = 0.0
|
138
144
|
gen_throughput: float = 0.0
|
139
145
|
num_queue_reqs: int = 0
|
140
|
-
cache_hit_rate: float = 0.0
|
141
146
|
num_grammar_queue_reqs: int = 0
|
142
|
-
|
147
|
+
num_running_reqs_offline_batch: int = 0
|
143
148
|
avg_request_queue_latency: float = 0.0
|
149
|
+
cache_hit_rate: float = 0.0
|
150
|
+
|
151
|
+
# Speculative decoding
|
152
|
+
spec_accept_length: float = 0.0
|
153
|
+
|
154
|
+
# PD disaggregation
|
144
155
|
num_prefill_prealloc_queue_reqs: int = 0
|
145
156
|
num_prefill_inflight_queue_reqs: int = 0
|
146
157
|
num_decode_prealloc_queue_reqs: int = 0
|
147
158
|
num_decode_transfer_queue_reqs: int = 0
|
159
|
+
kv_transfer_speed_gb_s: float = 0.0
|
160
|
+
kv_transfer_latency_ms: float = 0.0
|
161
|
+
|
162
|
+
# Retract
|
148
163
|
total_retracted_reqs: int = 0
|
164
|
+
num_retracted_reqs: int = 0
|
165
|
+
num_paused_reqs: int = 0
|
166
|
+
|
167
|
+
# Utilization
|
168
|
+
utilization: float = 0.0
|
169
|
+
max_running_requests_under_SLO: Optional[int] = None
|
170
|
+
|
171
|
+
# Engine startup
|
172
|
+
engine_startup_time: float = 0.0
|
173
|
+
engine_load_weights_time: float = 0.0
|
149
174
|
|
150
175
|
|
151
176
|
class SchedulerMetricsCollector:
|
152
177
|
|
153
178
|
def __init__(self, labels: Dict[str, str]) -> None:
|
154
179
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
155
|
-
from prometheus_client import Counter, Gauge
|
180
|
+
from prometheus_client import Counter, Gauge, Histogram
|
156
181
|
|
157
182
|
self.labels = labels
|
158
183
|
self.last_log_time = time.perf_counter()
|
@@ -163,115 +188,338 @@ class SchedulerMetricsCollector:
|
|
163
188
|
labelnames=labels.keys(),
|
164
189
|
multiprocess_mode="mostrecent",
|
165
190
|
)
|
166
|
-
|
167
191
|
self.num_used_tokens = Gauge(
|
168
192
|
name="sglang:num_used_tokens",
|
169
193
|
documentation="The number of used tokens.",
|
170
194
|
labelnames=labels.keys(),
|
171
195
|
multiprocess_mode="mostrecent",
|
172
196
|
)
|
173
|
-
|
174
197
|
self.token_usage = Gauge(
|
175
198
|
name="sglang:token_usage",
|
176
199
|
documentation="The token usage.",
|
177
200
|
labelnames=labels.keys(),
|
178
201
|
multiprocess_mode="mostrecent",
|
179
202
|
)
|
180
|
-
|
203
|
+
self.swa_token_usage = Gauge(
|
204
|
+
name="sglang:swa_token_usage",
|
205
|
+
documentation="The token usage for SWA layers.",
|
206
|
+
labelnames=labels.keys(),
|
207
|
+
multiprocess_mode="mostrecent",
|
208
|
+
)
|
181
209
|
self.gen_throughput = Gauge(
|
182
210
|
name="sglang:gen_throughput",
|
183
211
|
documentation="The generation throughput (token/s).",
|
184
212
|
labelnames=labels.keys(),
|
185
213
|
multiprocess_mode="mostrecent",
|
186
214
|
)
|
187
|
-
|
188
215
|
self.num_queue_reqs = Gauge(
|
189
216
|
name="sglang:num_queue_reqs",
|
190
217
|
documentation="The number of requests in the waiting queue.",
|
191
218
|
labelnames=labels.keys(),
|
192
219
|
multiprocess_mode="mostrecent",
|
193
220
|
)
|
194
|
-
|
195
221
|
self.num_grammar_queue_reqs = Gauge(
|
196
222
|
name="sglang:num_grammar_queue_reqs",
|
197
223
|
documentation="The number of requests in the grammar waiting queue.",
|
198
224
|
labelnames=labels.keys(),
|
199
225
|
multiprocess_mode="mostrecent",
|
200
226
|
)
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
documentation="The prefix cache hit rate.",
|
205
|
-
labelnames=labels.keys(),
|
206
|
-
multiprocess_mode="mostrecent",
|
207
|
-
)
|
208
|
-
|
209
|
-
self.spec_accept_length = Gauge(
|
210
|
-
name="sglang:spec_accept_length",
|
211
|
-
documentation="The average acceptance length of speculative decoding.",
|
227
|
+
self.num_running_reqs_offline_batch = Gauge(
|
228
|
+
name="sglang:num_running_reqs_offline_batch",
|
229
|
+
documentation="The number of running low-priority offline batch requests(label is 'batch').",
|
212
230
|
labelnames=labels.keys(),
|
213
231
|
multiprocess_mode="mostrecent",
|
214
232
|
)
|
215
|
-
|
216
233
|
self.avg_request_queue_latency = Gauge(
|
217
234
|
name="sglang:avg_request_queue_latency",
|
218
235
|
documentation="The average request queue latency for the last batch of requests in seconds.",
|
219
236
|
labelnames=labels.keys(),
|
220
237
|
multiprocess_mode="mostrecent",
|
221
238
|
)
|
239
|
+
self.cache_hit_rate = Gauge(
|
240
|
+
name="sglang:cache_hit_rate",
|
241
|
+
documentation="The prefix cache hit rate.",
|
242
|
+
labelnames=labels.keys(),
|
243
|
+
multiprocess_mode="mostrecent",
|
244
|
+
)
|
222
245
|
|
223
|
-
|
224
|
-
|
225
|
-
|
246
|
+
# Speculative decoding
|
247
|
+
self.spec_accept_length = Gauge(
|
248
|
+
name="sglang:spec_accept_length",
|
249
|
+
documentation="The average acceptance length of speculative decoding.",
|
226
250
|
labelnames=labels.keys(),
|
227
251
|
multiprocess_mode="mostrecent",
|
228
252
|
)
|
229
253
|
|
230
|
-
#
|
254
|
+
# PD disaggregation
|
231
255
|
self.num_prefill_prealloc_queue_reqs = Gauge(
|
232
256
|
name="sglang:num_prefill_prealloc_queue_reqs",
|
233
257
|
documentation="The number of requests in the prefill prealloc queue.",
|
234
258
|
labelnames=labels.keys(),
|
235
259
|
multiprocess_mode="mostrecent",
|
236
260
|
)
|
237
|
-
|
238
261
|
self.num_prefill_inflight_queue_reqs = Gauge(
|
239
262
|
name="sglang:num_prefill_inflight_queue_reqs",
|
240
263
|
documentation="The number of requests in the prefill inflight queue.",
|
241
264
|
labelnames=labels.keys(),
|
242
265
|
multiprocess_mode="mostrecent",
|
243
266
|
)
|
244
|
-
|
245
267
|
self.num_decode_prealloc_queue_reqs = Gauge(
|
246
268
|
name="sglang:num_decode_prealloc_queue_reqs",
|
247
269
|
documentation="The number of requests in the decode prealloc queue.",
|
248
270
|
labelnames=labels.keys(),
|
249
271
|
multiprocess_mode="mostrecent",
|
250
272
|
)
|
251
|
-
|
252
273
|
self.num_decode_transfer_queue_reqs = Gauge(
|
253
274
|
name="sglang:num_decode_transfer_queue_reqs",
|
254
275
|
documentation="The number of requests in the decode transfer queue.",
|
255
276
|
labelnames=labels.keys(),
|
256
277
|
multiprocess_mode="mostrecent",
|
257
278
|
)
|
258
|
-
|
259
279
|
self.num_bootstrap_failed_reqs = Counter(
|
260
|
-
name="sglang:
|
280
|
+
name="sglang:num_bootstrap_failed_reqs_total",
|
261
281
|
documentation="The number of bootstrap failed requests.",
|
262
282
|
labelnames=labels.keys(),
|
263
283
|
)
|
264
|
-
|
265
284
|
self.num_transfer_failed_reqs = Counter(
|
266
|
-
name="sglang:
|
285
|
+
name="sglang:num_transfer_failed_reqs_total",
|
267
286
|
documentation="The number of transfer failed requests.",
|
268
287
|
labelnames=labels.keys(),
|
269
288
|
)
|
289
|
+
self.kv_transfer_speed_gb_s = Gauge(
|
290
|
+
name="sglang:kv_transfer_speed_gb_s",
|
291
|
+
documentation="The transfer speed of the KV cache in GB/s.",
|
292
|
+
labelnames=labels.keys(),
|
293
|
+
multiprocess_mode="mostrecent",
|
294
|
+
)
|
295
|
+
self.kv_transfer_latency_ms = Gauge(
|
296
|
+
name="sglang:kv_transfer_latency_ms",
|
297
|
+
documentation="The transfer latency of the KV cache in ms.",
|
298
|
+
labelnames=labels.keys(),
|
299
|
+
multiprocess_mode="mostrecent",
|
300
|
+
)
|
301
|
+
|
302
|
+
# Retract
|
303
|
+
self.total_retracted_reqs = Gauge(
|
304
|
+
name="sglang:total_retracted_reqs",
|
305
|
+
documentation="The total number of retracted requests due to kvcache full.",
|
306
|
+
labelnames=labels.keys(),
|
307
|
+
multiprocess_mode="mostrecent",
|
308
|
+
)
|
309
|
+
self.num_retracted_reqs = Gauge(
|
310
|
+
name="sglang:num_retracted_reqs",
|
311
|
+
documentation="The number of retracted requests.",
|
312
|
+
labelnames=labels.keys(),
|
313
|
+
)
|
314
|
+
self.num_paused_reqs = Gauge(
|
315
|
+
name="sglang:num_paused_reqs",
|
316
|
+
documentation="The number of paused requests by async weight sync.",
|
317
|
+
labelnames=labels.keys(),
|
318
|
+
)
|
319
|
+
|
320
|
+
# Utilization
|
321
|
+
self.utilization = Gauge(
|
322
|
+
name="sglang:utilization",
|
323
|
+
documentation="The utilization.",
|
324
|
+
labelnames=labels.keys(),
|
325
|
+
multiprocess_mode="mostrecent",
|
326
|
+
)
|
327
|
+
self.max_running_requests_under_SLO = Gauge(
|
328
|
+
name="sglang:max_running_requests_under_SLO",
|
329
|
+
documentation="The maximum number of running requests under SLO.",
|
330
|
+
labelnames=labels.keys(),
|
331
|
+
multiprocess_mode="mostrecent",
|
332
|
+
)
|
333
|
+
|
334
|
+
# Engine startup
|
335
|
+
self.engine_startup_time = Gauge(
|
336
|
+
name="sglang:engine_startup_time",
|
337
|
+
documentation="The time taken for the engine to start up.",
|
338
|
+
labelnames=labels.keys(),
|
339
|
+
multiprocess_mode="mostrecent",
|
340
|
+
)
|
341
|
+
self.engine_load_weights_time = Gauge(
|
342
|
+
name="sglang:engine_load_weights_time",
|
343
|
+
documentation="The time taken for the engine to load weights.",
|
344
|
+
labelnames=labels.keys(),
|
345
|
+
multiprocess_mode="mostrecent",
|
346
|
+
)
|
347
|
+
|
348
|
+
# Additional queueing time histogram
|
349
|
+
self.queue_time = Histogram(
|
350
|
+
name="sglang:queue_time_s",
|
351
|
+
documentation="Histogram of queueing time in seconds.",
|
352
|
+
labelnames=labels.keys(),
|
353
|
+
buckets=[
|
354
|
+
0.0,
|
355
|
+
0.1,
|
356
|
+
0.2,
|
357
|
+
0.5,
|
358
|
+
1,
|
359
|
+
2,
|
360
|
+
3,
|
361
|
+
4,
|
362
|
+
5,
|
363
|
+
10,
|
364
|
+
15,
|
365
|
+
20,
|
366
|
+
30,
|
367
|
+
40,
|
368
|
+
50,
|
369
|
+
60,
|
370
|
+
70,
|
371
|
+
80,
|
372
|
+
90,
|
373
|
+
100,
|
374
|
+
200,
|
375
|
+
300,
|
376
|
+
400,
|
377
|
+
500,
|
378
|
+
600,
|
379
|
+
700,
|
380
|
+
800,
|
381
|
+
900,
|
382
|
+
1000,
|
383
|
+
1200,
|
384
|
+
1400,
|
385
|
+
1600,
|
386
|
+
1800,
|
387
|
+
2000,
|
388
|
+
2500,
|
389
|
+
3000,
|
390
|
+
],
|
391
|
+
)
|
392
|
+
|
393
|
+
# Grammar metrics
|
394
|
+
self.grammar_compilation_time = Histogram(
|
395
|
+
name="sglang:grammar_compilation_time_seconds",
|
396
|
+
documentation="Histogram of grammar compilation time in seconds.",
|
397
|
+
labelnames=labels.keys(),
|
398
|
+
buckets=[
|
399
|
+
0.0,
|
400
|
+
0.01,
|
401
|
+
0.02,
|
402
|
+
0.05,
|
403
|
+
0.1,
|
404
|
+
0.2,
|
405
|
+
0.5,
|
406
|
+
1,
|
407
|
+
2,
|
408
|
+
5,
|
409
|
+
10,
|
410
|
+
20,
|
411
|
+
30,
|
412
|
+
60,
|
413
|
+
90,
|
414
|
+
120,
|
415
|
+
240,
|
416
|
+
],
|
417
|
+
)
|
418
|
+
self.num_grammar_cache_hit = Counter(
|
419
|
+
name="sglang:num_grammar_cache_hit_total",
|
420
|
+
documentation="Number of grammar cache hits.",
|
421
|
+
labelnames=labels.keys(),
|
422
|
+
)
|
423
|
+
self.num_grammar_aborted = Counter(
|
424
|
+
name="sglang:num_grammar_aborted_total",
|
425
|
+
documentation="Number of grammar aborted requests.",
|
426
|
+
labelnames=labels.keys(),
|
427
|
+
)
|
428
|
+
self.num_grammar_total = Counter(
|
429
|
+
name="sglang:num_grammar_total",
|
430
|
+
documentation="Number of the total grammar requests.",
|
431
|
+
labelnames=labels.keys(),
|
432
|
+
)
|
433
|
+
self.grammar_schema_count = Histogram(
|
434
|
+
name="sglang:grammar_schema_count",
|
435
|
+
documentation="Histogram of grammar schema count.",
|
436
|
+
labelnames=labels.keys(),
|
437
|
+
buckets=[
|
438
|
+
0,
|
439
|
+
1,
|
440
|
+
2,
|
441
|
+
5,
|
442
|
+
10,
|
443
|
+
20,
|
444
|
+
30,
|
445
|
+
40,
|
446
|
+
60,
|
447
|
+
80,
|
448
|
+
100,
|
449
|
+
120,
|
450
|
+
140,
|
451
|
+
160,
|
452
|
+
180,
|
453
|
+
200,
|
454
|
+
300,
|
455
|
+
400,
|
456
|
+
500,
|
457
|
+
700,
|
458
|
+
1000,
|
459
|
+
],
|
460
|
+
)
|
461
|
+
self.grammar_ebnf_size = Histogram(
|
462
|
+
name="sglang:grammar_ebnf_size",
|
463
|
+
documentation="Histogram of grammar EBNF size.",
|
464
|
+
labelnames=labels.keys(),
|
465
|
+
buckets=[
|
466
|
+
0,
|
467
|
+
50,
|
468
|
+
100,
|
469
|
+
200,
|
470
|
+
300,
|
471
|
+
500,
|
472
|
+
1000,
|
473
|
+
2000,
|
474
|
+
3000,
|
475
|
+
5000,
|
476
|
+
10000,
|
477
|
+
20000,
|
478
|
+
30000,
|
479
|
+
50000,
|
480
|
+
100000,
|
481
|
+
],
|
482
|
+
)
|
483
|
+
|
484
|
+
tree_traversal_time_buckets = [
|
485
|
+
0.0,
|
486
|
+
0.01,
|
487
|
+
0.02,
|
488
|
+
0.05,
|
489
|
+
0.1,
|
490
|
+
0.2,
|
491
|
+
0.5,
|
492
|
+
1,
|
493
|
+
2,
|
494
|
+
5,
|
495
|
+
10,
|
496
|
+
15,
|
497
|
+
30,
|
498
|
+
60,
|
499
|
+
90,
|
500
|
+
120,
|
501
|
+
240,
|
502
|
+
]
|
503
|
+
self.grammar_tree_traversal_time_avg = Histogram(
|
504
|
+
name="sglang:grammar_tree_traversal_time_avg",
|
505
|
+
documentation="Histogram of average grammar tree traversal time in seconds.",
|
506
|
+
labelnames=labels.keys(),
|
507
|
+
buckets=tree_traversal_time_buckets,
|
508
|
+
)
|
509
|
+
self.grammar_tree_traversal_time_max = Histogram(
|
510
|
+
name="sglang:grammar_tree_traversal_time_max",
|
511
|
+
documentation="Histogram of max grammar tree traversal time in seconds.",
|
512
|
+
labelnames=labels.keys(),
|
513
|
+
buckets=tree_traversal_time_buckets,
|
514
|
+
)
|
270
515
|
|
271
516
|
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
|
272
517
|
# Convenience function for logging to gauge.
|
273
518
|
gauge.labels(**self.labels).set(data)
|
274
519
|
|
520
|
+
def log_histogram(self, histogram, data: Union[int, float]) -> None:
|
521
|
+
histogram.labels(**self.labels).observe(data)
|
522
|
+
|
275
523
|
def increment_bootstrap_failed_reqs(self) -> None:
|
276
524
|
self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
|
277
525
|
|
@@ -282,14 +530,20 @@ class SchedulerMetricsCollector:
|
|
282
530
|
self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
|
283
531
|
self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
|
284
532
|
self._log_gauge(self.token_usage, stats.token_usage)
|
533
|
+
self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
|
285
534
|
self._log_gauge(self.gen_throughput, stats.gen_throughput)
|
286
535
|
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
|
287
536
|
self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
|
537
|
+
self._log_gauge(
|
538
|
+
self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
|
539
|
+
)
|
288
540
|
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
541
|
+
self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
|
542
|
+
|
543
|
+
# Speculative decoding
|
289
544
|
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
290
|
-
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
291
545
|
|
292
|
-
#
|
546
|
+
# PD disaggregation
|
293
547
|
self._log_gauge(
|
294
548
|
self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
|
295
549
|
)
|
@@ -302,14 +556,59 @@ class SchedulerMetricsCollector:
|
|
302
556
|
self._log_gauge(
|
303
557
|
self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
|
304
558
|
)
|
559
|
+
self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
|
560
|
+
self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
|
561
|
+
|
562
|
+
# Retract
|
563
|
+
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
564
|
+
self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
|
565
|
+
self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
|
566
|
+
|
567
|
+
# Utilization
|
568
|
+
self._log_gauge(self.utilization, stats.utilization)
|
569
|
+
if stats.max_running_requests_under_SLO is not None:
|
570
|
+
self._log_gauge(
|
571
|
+
self.max_running_requests_under_SLO,
|
572
|
+
stats.max_running_requests_under_SLO,
|
573
|
+
)
|
574
|
+
|
575
|
+
# Engine startup time
|
576
|
+
self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
|
577
|
+
if stats.engine_load_weights_time is not None:
|
578
|
+
self._log_gauge(
|
579
|
+
self.engine_load_weights_time, stats.engine_load_weights_time
|
580
|
+
)
|
305
581
|
|
306
582
|
self.last_log_time = time.perf_counter()
|
307
583
|
|
584
|
+
def log_grammar_stats(self, grammar_stats) -> None:
    """Record one grammar-stats sample into the grammar histograms and counters.

    ``grammar_stats`` is duck-typed (no import of the stats class, to avoid a
    cross-package dependency): every field is read with ``getattr`` and a
    missing or ``None`` field is simply not recorded. The total-grammar
    counter is incremented on every call.
    """
    # Scalar fields that map one-to-one onto a histogram attribute of self.
    scalar_fields = (
        ("compilation_time", "grammar_compilation_time"),
        ("schema_count", "grammar_schema_count"),
        ("ebnf_size", "grammar_ebnf_size"),
    )
    for stat_name, histogram_attr in scalar_fields:
        observed = getattr(grammar_stats, stat_name, None)
        if observed is not None:
            self.log_histogram(getattr(self, histogram_attr), observed)

    # Tree traversal times arrive as a collection; an empty or missing one is
    # skipped, otherwise its maximum and mean are recorded separately.
    traversal_times = getattr(grammar_stats, "tree_traversal_time", None)
    if traversal_times:
        slowest = max(traversal_times)
        mean = sum(traversal_times) / len(traversal_times)
        self.log_histogram(self.grammar_tree_traversal_time_max, slowest)
        self.log_histogram(self.grammar_tree_traversal_time_avg, mean)

    # Boolean flags bump their own counter when truthy.
    flag_counters = (
        ("is_cache_hit", "num_grammar_cache_hit"),
        ("is_grammar_aborted", "num_grammar_aborted"),
    )
    for flag_name, counter_attr in flag_counters:
        if getattr(grammar_stats, flag_name, False):
            getattr(self, counter_attr).labels(**self.labels).inc(1)

    self.num_grammar_total.labels(**self.labels).inc(1)
|
605
|
+
|
308
606
|
|
309
607
|
class TokenizerMetricsCollector:
|
310
608
|
def __init__(
|
311
609
|
self,
|
312
|
-
|
610
|
+
server_args: Optional[ServerArgs] = None,
|
611
|
+
labels: Dict[str, str] = None,
|
313
612
|
bucket_time_to_first_token: Optional[List[float]] = None,
|
314
613
|
bucket_inter_token_latency: Optional[List[float]] = None,
|
315
614
|
bucket_e2e_request_latency: Optional[List[float]] = None,
|
@@ -318,7 +617,7 @@ class TokenizerMetricsCollector:
|
|
318
617
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
319
618
|
from prometheus_client import Counter, Histogram
|
320
619
|
|
321
|
-
self.labels = labels
|
620
|
+
self.labels = labels or {}
|
322
621
|
self.collect_tokens_histogram = collect_tokens_histogram
|
323
622
|
|
324
623
|
self.prompt_tokens_total = Counter(
|
@@ -334,7 +633,7 @@ class TokenizerMetricsCollector:
|
|
334
633
|
)
|
335
634
|
|
336
635
|
if collect_tokens_histogram:
|
337
|
-
|
636
|
+
default_bucket_prompt_tokens = [
|
338
637
|
100,
|
339
638
|
300,
|
340
639
|
500,
|
@@ -358,39 +657,30 @@ class TokenizerMetricsCollector:
|
|
358
657
|
30000,
|
359
658
|
35000,
|
360
659
|
40000,
|
660
|
+
66000,
|
661
|
+
99000,
|
662
|
+
132000,
|
663
|
+
300000,
|
664
|
+
600000,
|
665
|
+
900000,
|
666
|
+
1100000,
|
361
667
|
]
|
362
668
|
self.prompt_tokens_histogram = Histogram(
|
363
669
|
name="sglang:prompt_tokens_histogram",
|
364
670
|
documentation="Histogram of prompt token length.",
|
365
671
|
labelnames=labels.keys(),
|
366
|
-
buckets=
|
672
|
+
buckets=generate_buckets(
|
673
|
+
server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
|
674
|
+
),
|
367
675
|
)
|
368
|
-
bucket_generation_tokens = [
|
369
|
-
100,
|
370
|
-
300,
|
371
|
-
500,
|
372
|
-
1000,
|
373
|
-
1200,
|
374
|
-
1500,
|
375
|
-
1700,
|
376
|
-
2000,
|
377
|
-
2500,
|
378
|
-
3000,
|
379
|
-
3500,
|
380
|
-
4000,
|
381
|
-
4500,
|
382
|
-
5000,
|
383
|
-
6000,
|
384
|
-
7000,
|
385
|
-
8000,
|
386
|
-
9000,
|
387
|
-
10000,
|
388
|
-
]
|
389
676
|
self.generation_tokens_histogram = Histogram(
|
390
677
|
name="sglang:generation_tokens_histogram",
|
391
678
|
documentation="Histogram of generation token length.",
|
392
679
|
labelnames=labels.keys(),
|
393
|
-
buckets=
|
680
|
+
buckets=generate_buckets(
|
681
|
+
server_args.generation_tokens_buckets,
|
682
|
+
default_bucket_prompt_tokens,
|
683
|
+
),
|
394
684
|
)
|
395
685
|
|
396
686
|
self.cached_tokens_total = Counter(
|
@@ -459,7 +749,10 @@ class TokenizerMetricsCollector:
|
|
459
749
|
100,
|
460
750
|
200,
|
461
751
|
400,
|
462
|
-
|
752
|
+
600,
|
753
|
+
1200,
|
754
|
+
1800,
|
755
|
+
2400,
|
463
756
|
]
|
464
757
|
|
465
758
|
if bucket_inter_token_latency is None:
|
@@ -510,38 +803,68 @@ class TokenizerMetricsCollector:
|
|
510
803
|
buckets=bucket_e2e_request_latency,
|
511
804
|
)
|
512
805
|
|
513
|
-
|
514
|
-
|
806
|
+
# Offline-batch-specific TTFT (time to first token) histogram
|
807
|
+
self.histogram_time_to_first_token_offline_batch = Histogram(
|
808
|
+
name="sglang:time_to_first_token_seconds_offline_batch",
|
809
|
+
documentation="Histogram of time to first token in seconds for offline batch requests.",
|
810
|
+
labelnames=labels.keys(),
|
811
|
+
buckets=bucket_time_to_first_token,
|
812
|
+
)
|
515
813
|
|
516
814
|
def observe_one_finished_request(
|
517
815
|
self,
|
816
|
+
labels: Dict[str, str],
|
518
817
|
prompt_tokens: int,
|
519
818
|
generation_tokens: int,
|
520
819
|
cached_tokens: int,
|
521
820
|
e2e_latency: float,
|
522
821
|
has_grammar: bool,
|
523
822
|
):
|
524
|
-
self.prompt_tokens_total.labels(**
|
525
|
-
self.generation_tokens_total.labels(**
|
823
|
+
self.prompt_tokens_total.labels(**labels).inc(prompt_tokens)
|
824
|
+
self.generation_tokens_total.labels(**labels).inc(generation_tokens)
|
526
825
|
if cached_tokens > 0:
|
527
|
-
self.cached_tokens_total.labels(**
|
528
|
-
self.num_requests_total.labels(**
|
826
|
+
self.cached_tokens_total.labels(**labels).inc(cached_tokens)
|
827
|
+
self.num_requests_total.labels(**labels).inc(1)
|
529
828
|
if has_grammar:
|
530
|
-
self.num_so_requests_total.labels(**
|
531
|
-
self.
|
829
|
+
self.num_so_requests_total.labels(**labels).inc(1)
|
830
|
+
self.histogram_e2e_request_latency.labels(**labels).observe(float(e2e_latency))
|
532
831
|
if self.collect_tokens_histogram:
|
533
|
-
self.
|
534
|
-
self.
|
535
|
-
|
536
|
-
|
537
|
-
self.histogram_time_to_first_token.labels(**self.labels).observe(value)
|
832
|
+
self.prompt_tokens_histogram.labels(**labels).observe(float(prompt_tokens))
|
833
|
+
self.generation_tokens_histogram.labels(**labels).observe(
|
834
|
+
float(generation_tokens)
|
835
|
+
)
|
538
836
|
|
539
|
-
def
|
837
|
+
def observe_time_to_first_token(
    self, labels: Dict[str, str], value: float, type: str = ""
):
    """Record one time-to-first-token sample under the given labels.

    Args:
        labels: Label values passed to the histogram's ``labels(...)``.
        value: The sample to observe.
        type: ``"batch"`` routes the sample to the offline-batch histogram;
            any other value (including the default) uses the regular one.
    """
    target = (
        self.histogram_time_to_first_token_offline_batch
        if type == "batch"
        else self.histogram_time_to_first_token
    )
    target.labels(**labels).observe(value)
|
846
|
+
|
847
|
+
def check_time_to_first_token_straggler(self, value: float) -> bool:
    """Report whether ``value`` falls at or beyond the histogram's p99 bucket.

    Reads the bucket counters of the time-to-first-token histogram child
    selected by ``self.labels``. With fewer than 100 recorded observations
    the answer is always ``False``. Otherwise the buckets are walked in order
    until the running count exceeds 99% of the total, and ``value`` is
    compared against that bucket's upper bound.
    """
    child = self.histogram_time_to_first_token.labels(**self.labels)
    counters = child._buckets

    observed = sum(counter._value for counter in counters)
    if observed < 100:
        # Too few samples for a meaningful tail estimate.
        return False

    cutoff = observed * 0.99
    running = 0
    for index, counter in enumerate(counters):
        running += counter._value
        if running > cutoff:
            # First bucket whose cumulative count crosses the p99 mark.
            return value >= child._upper_bounds[index]
    return False
|
859
|
+
|
860
|
+
def observe_inter_token_latency(
|
861
|
+
self, labels: Dict[str, str], internval: float, num_new_tokens: int
|
862
|
+
):
|
540
863
|
adjusted_interval = internval / num_new_tokens
|
541
864
|
|
542
865
|
# A faster version of the Histogram::observe which observes multiple values at the same time.
|
543
866
|
# reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
|
544
|
-
his = self.histogram_inter_token_latency_seconds.labels(**
|
867
|
+
his = self.histogram_inter_token_latency_seconds.labels(**labels)
|
545
868
|
his._sum.inc(internval)
|
546
869
|
|
547
870
|
for i, bound in enumerate(his._upper_bounds):
|
@@ -551,3 +874,105 @@ class TokenizerMetricsCollector:
|
|
551
874
|
|
552
875
|
def observe_one_aborted_request(self):
    """Increment the aborted-requests counter by one under ``self.labels``."""
    aborted_counter = self.num_aborted_requests_total.labels(**self.labels)
    aborted_counter.inc(1)
|
877
|
+
|
878
|
+
|
879
|
+
@dataclass
class StorageMetrics:
    """Samples of storage prefetch/backup activity to be logged as histograms.

    Each field is a list of samples; every instance gets its own empty lists
    by default.
    """

    # Prefetch page counts (logged to the "prefetch pages of batches" histogram).
    prefetch_pgs: List[int] = field(default_factory=list)
    # Backup page counts (logged to the "backup pages of batches" histogram).
    backup_pgs: List[int] = field(default_factory=list)
    # Prefetch bandwidth samples (the histogram documents the unit as GB/s).
    prefetch_bandwidth: List[float] = field(default_factory=list)
    # Backup bandwidth samples (the histogram documents the unit as GB/s).
    backup_bandwidth: List[float] = field(default_factory=list)
|
885
|
+
|
886
|
+
|
887
|
+
class StorageMetricsCollector:
    """Prometheus counters and histograms for storage prefetch/backup traffic.

    Owns two token counters and four histograms (page counts and bandwidth,
    each for prefetch and backup). Every metric is declared with the label
    names of ``labels`` and is always updated under those same label values.
    """

    def __init__(
        self,
        labels: Dict[str, str],
    ):
        """Create all metrics, using the keys of ``labels`` as label names."""
        # Imported inside the constructor rather than at module level.
        # NOTE(review): presumably for the same reason the tokenizer collector
        # gives (import after PROMETHEUS_MULTIPROC_DIR is set) - confirm.
        from prometheus_client import Counter, Histogram

        self.labels = labels
        label_names = labels.keys()

        self.prefetched_tokens_total = Counter(
            name="sglang:prefetched_tokens_total",
            documentation="Number of prefetched prompt tokens.",
            labelnames=label_names,
        )

        self.backuped_tokens_total = Counter(
            name="sglang:backuped_tokens_total",
            documentation="Number of backuped tokens.",
            labelnames=label_names,
        )

        # Bucket upper bounds shared by the two page-count histograms.
        page_buckets = [1, 5, 10, 50, 100]
        # Bucket upper bounds shared by the two bandwidth histograms.
        bandwidth_buckets = [0.1, 0.5, 1, 5, 10, 50, 100]

        def make_histogram(name, documentation, buckets):
            # All four histograms differ only in name, help text and buckets.
            return Histogram(
                name=name,
                documentation=documentation,
                labelnames=label_names,
                buckets=buckets,
            )

        self.histogram_prefetch_pgs = make_histogram(
            "sglang:prefetch_pgs",
            "Histogram of prefetch pages of batches.",
            page_buckets,
        )
        self.histogram_backup_pgs = make_histogram(
            "sglang:backup_pgs",
            "Histogram of backup pages of batches.",
            page_buckets,
        )
        self.histogram_prefetch_bandwidth = make_histogram(
            "sglang:prefetch_bandwidth",
            "Histogram of prefetch bandwidth in GB/s.",
            bandwidth_buckets,
        )
        self.histogram_backup_bandwidth = make_histogram(
            "sglang:backup_bandwidth",
            "Histogram of backup bandwidth in GB/s.",
            bandwidth_buckets,
        )

    def log_prefetched_tokens(self, prefetched_tokens: int):
        """Add ``prefetched_tokens`` to the prefetched counter when positive."""
        if prefetched_tokens > 0:
            counter = self.prefetched_tokens_total.labels(**self.labels)
            counter.inc(prefetched_tokens)

    def log_backuped_tokens(self, backuped_tokens: int):
        """Add ``backuped_tokens`` to the backed-up counter when positive."""
        if backuped_tokens > 0:
            counter = self.backuped_tokens_total.labels(**self.labels)
            counter.inc(backuped_tokens)

    def _log_histogram(self, histogram, data: Union[int, float]):
        """Observe a single sample on ``histogram`` under ``self.labels``."""
        labelled = histogram.labels(**self.labels)
        labelled.observe(data)

    def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
        """Observe every sample held by ``storage_metrics``; ``None`` is a no-op.

        Samples are recorded field by field in the order prefetch pages,
        backup pages, prefetch bandwidth, backup bandwidth.
        """
        if storage_metrics is None:
            return

        assert isinstance(storage_metrics, StorageMetrics)

        # (histogram attribute on self, sample-list attribute on the metrics)
        pairs = (
            ("histogram_prefetch_pgs", "prefetch_pgs"),
            ("histogram_backup_pgs", "backup_pgs"),
            ("histogram_prefetch_bandwidth", "prefetch_bandwidth"),
            ("histogram_backup_bandwidth", "backup_bandwidth"),
        )
        for histogram_attr, samples_attr in pairs:
            for sample in getattr(storage_metrics, samples_attr):
                self._log_histogram(getattr(self, histogram_attr), sample)
|