sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +24 -3
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +85 -54
- sglang/srt/entrypoints/openai/protocol.py +4 -1
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +36 -16
- sglang/srt/entrypoints/openai/serving_completions.py +12 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +6 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +147 -47
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +64 -40
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +158 -160
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +187 -39
- sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +259 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/hicache_storage.py +3 -23
- sglang/srt/mem_cache/hiradix_cache.py +103 -43
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +105 -46
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +493 -76
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +59 -2
- sglang/srt/model_executor/model_runner.py +356 -29
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +109 -15
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +1 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +56 -12
- sglang/srt/models/qwen3_moe.py +1 -1
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +276 -35
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +43 -4
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +34 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +11 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
- sglang/srt/disaggregation/launch_lb.py +0 -118
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -12,9 +12,8 @@
|
|
12
12
|
# limitations under the License.
|
13
13
|
# ==============================================================================
|
14
14
|
"""Utilities for Prometheus Metrics Collection."""
|
15
|
-
|
16
15
|
import time
|
17
|
-
from dataclasses import dataclass
|
16
|
+
from dataclasses import dataclass, field
|
18
17
|
from enum import Enum
|
19
18
|
from typing import Dict, List, Optional, Union
|
20
19
|
|
@@ -50,6 +49,9 @@ class TimeStats:
|
|
50
49
|
DECODE = "decode"
|
51
50
|
INVALID = "invalid"
|
52
51
|
|
52
|
+
def get_queueing_time(self) -> float:
|
53
|
+
return self.forward_entry_time - self.wait_queue_entry_time
|
54
|
+
|
53
55
|
def __str__(self) -> str:
|
54
56
|
# if unified
|
55
57
|
_type = self.get_type()
|
@@ -134,27 +136,48 @@ class TimeStats:
|
|
134
136
|
|
135
137
|
@dataclass
|
136
138
|
class SchedulerStats:
|
139
|
+
# Basics
|
137
140
|
num_running_reqs: int = 0
|
138
141
|
num_used_tokens: int = 0
|
139
142
|
token_usage: float = 0.0
|
143
|
+
swa_token_usage: float = 0.0
|
140
144
|
gen_throughput: float = 0.0
|
141
145
|
num_queue_reqs: int = 0
|
142
|
-
cache_hit_rate: float = 0.0
|
143
146
|
num_grammar_queue_reqs: int = 0
|
144
|
-
|
147
|
+
num_running_reqs_offline_batch: int = 0
|
145
148
|
avg_request_queue_latency: float = 0.0
|
149
|
+
cache_hit_rate: float = 0.0
|
150
|
+
|
151
|
+
# Speculative decoding
|
152
|
+
spec_accept_length: float = 0.0
|
153
|
+
|
154
|
+
# PD disaggregation
|
146
155
|
num_prefill_prealloc_queue_reqs: int = 0
|
147
156
|
num_prefill_inflight_queue_reqs: int = 0
|
148
157
|
num_decode_prealloc_queue_reqs: int = 0
|
149
158
|
num_decode_transfer_queue_reqs: int = 0
|
159
|
+
kv_transfer_speed_gb_s: float = 0.0
|
160
|
+
kv_transfer_latency_ms: float = 0.0
|
161
|
+
|
162
|
+
# Retract
|
150
163
|
total_retracted_reqs: int = 0
|
164
|
+
num_retracted_reqs: int = 0
|
165
|
+
num_paused_reqs: int = 0
|
166
|
+
|
167
|
+
# Utilization
|
168
|
+
utilization: float = 0.0
|
169
|
+
max_running_requests_under_SLO: Optional[int] = None
|
170
|
+
|
171
|
+
# Engine startup
|
172
|
+
engine_startup_time: float = 0.0
|
173
|
+
engine_load_weights_time: float = 0.0
|
151
174
|
|
152
175
|
|
153
176
|
class SchedulerMetricsCollector:
|
154
177
|
|
155
178
|
def __init__(self, labels: Dict[str, str]) -> None:
|
156
179
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
157
|
-
from prometheus_client import Counter, Gauge
|
180
|
+
from prometheus_client import Counter, Gauge, Histogram
|
158
181
|
|
159
182
|
self.labels = labels
|
160
183
|
self.last_log_time = time.perf_counter()
|
@@ -165,115 +188,338 @@ class SchedulerMetricsCollector:
|
|
165
188
|
labelnames=labels.keys(),
|
166
189
|
multiprocess_mode="mostrecent",
|
167
190
|
)
|
168
|
-
|
169
191
|
self.num_used_tokens = Gauge(
|
170
192
|
name="sglang:num_used_tokens",
|
171
193
|
documentation="The number of used tokens.",
|
172
194
|
labelnames=labels.keys(),
|
173
195
|
multiprocess_mode="mostrecent",
|
174
196
|
)
|
175
|
-
|
176
197
|
self.token_usage = Gauge(
|
177
198
|
name="sglang:token_usage",
|
178
199
|
documentation="The token usage.",
|
179
200
|
labelnames=labels.keys(),
|
180
201
|
multiprocess_mode="mostrecent",
|
181
202
|
)
|
182
|
-
|
203
|
+
self.swa_token_usage = Gauge(
|
204
|
+
name="sglang:swa_token_usage",
|
205
|
+
documentation="The token usage for SWA layers.",
|
206
|
+
labelnames=labels.keys(),
|
207
|
+
multiprocess_mode="mostrecent",
|
208
|
+
)
|
183
209
|
self.gen_throughput = Gauge(
|
184
210
|
name="sglang:gen_throughput",
|
185
211
|
documentation="The generation throughput (token/s).",
|
186
212
|
labelnames=labels.keys(),
|
187
213
|
multiprocess_mode="mostrecent",
|
188
214
|
)
|
189
|
-
|
190
215
|
self.num_queue_reqs = Gauge(
|
191
216
|
name="sglang:num_queue_reqs",
|
192
217
|
documentation="The number of requests in the waiting queue.",
|
193
218
|
labelnames=labels.keys(),
|
194
219
|
multiprocess_mode="mostrecent",
|
195
220
|
)
|
196
|
-
|
197
221
|
self.num_grammar_queue_reqs = Gauge(
|
198
222
|
name="sglang:num_grammar_queue_reqs",
|
199
223
|
documentation="The number of requests in the grammar waiting queue.",
|
200
224
|
labelnames=labels.keys(),
|
201
225
|
multiprocess_mode="mostrecent",
|
202
226
|
)
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
documentation="The prefix cache hit rate.",
|
207
|
-
labelnames=labels.keys(),
|
208
|
-
multiprocess_mode="mostrecent",
|
209
|
-
)
|
210
|
-
|
211
|
-
self.spec_accept_length = Gauge(
|
212
|
-
name="sglang:spec_accept_length",
|
213
|
-
documentation="The average acceptance length of speculative decoding.",
|
227
|
+
self.num_running_reqs_offline_batch = Gauge(
|
228
|
+
name="sglang:num_running_reqs_offline_batch",
|
229
|
+
documentation="The number of running low-priority offline batch requests(label is 'batch').",
|
214
230
|
labelnames=labels.keys(),
|
215
231
|
multiprocess_mode="mostrecent",
|
216
232
|
)
|
217
|
-
|
218
233
|
self.avg_request_queue_latency = Gauge(
|
219
234
|
name="sglang:avg_request_queue_latency",
|
220
235
|
documentation="The average request queue latency for the last batch of requests in seconds.",
|
221
236
|
labelnames=labels.keys(),
|
222
237
|
multiprocess_mode="mostrecent",
|
223
238
|
)
|
239
|
+
self.cache_hit_rate = Gauge(
|
240
|
+
name="sglang:cache_hit_rate",
|
241
|
+
documentation="The prefix cache hit rate.",
|
242
|
+
labelnames=labels.keys(),
|
243
|
+
multiprocess_mode="mostrecent",
|
244
|
+
)
|
224
245
|
|
225
|
-
|
226
|
-
|
227
|
-
|
246
|
+
# Speculative decoding
|
247
|
+
self.spec_accept_length = Gauge(
|
248
|
+
name="sglang:spec_accept_length",
|
249
|
+
documentation="The average acceptance length of speculative decoding.",
|
228
250
|
labelnames=labels.keys(),
|
229
251
|
multiprocess_mode="mostrecent",
|
230
252
|
)
|
231
253
|
|
232
|
-
#
|
254
|
+
# PD disaggregation
|
233
255
|
self.num_prefill_prealloc_queue_reqs = Gauge(
|
234
256
|
name="sglang:num_prefill_prealloc_queue_reqs",
|
235
257
|
documentation="The number of requests in the prefill prealloc queue.",
|
236
258
|
labelnames=labels.keys(),
|
237
259
|
multiprocess_mode="mostrecent",
|
238
260
|
)
|
239
|
-
|
240
261
|
self.num_prefill_inflight_queue_reqs = Gauge(
|
241
262
|
name="sglang:num_prefill_inflight_queue_reqs",
|
242
263
|
documentation="The number of requests in the prefill inflight queue.",
|
243
264
|
labelnames=labels.keys(),
|
244
265
|
multiprocess_mode="mostrecent",
|
245
266
|
)
|
246
|
-
|
247
267
|
self.num_decode_prealloc_queue_reqs = Gauge(
|
248
268
|
name="sglang:num_decode_prealloc_queue_reqs",
|
249
269
|
documentation="The number of requests in the decode prealloc queue.",
|
250
270
|
labelnames=labels.keys(),
|
251
271
|
multiprocess_mode="mostrecent",
|
252
272
|
)
|
253
|
-
|
254
273
|
self.num_decode_transfer_queue_reqs = Gauge(
|
255
274
|
name="sglang:num_decode_transfer_queue_reqs",
|
256
275
|
documentation="The number of requests in the decode transfer queue.",
|
257
276
|
labelnames=labels.keys(),
|
258
277
|
multiprocess_mode="mostrecent",
|
259
278
|
)
|
260
|
-
|
261
279
|
self.num_bootstrap_failed_reqs = Counter(
|
262
|
-
name="sglang:
|
280
|
+
name="sglang:num_bootstrap_failed_reqs_total",
|
263
281
|
documentation="The number of bootstrap failed requests.",
|
264
282
|
labelnames=labels.keys(),
|
265
283
|
)
|
266
|
-
|
267
284
|
self.num_transfer_failed_reqs = Counter(
|
268
|
-
name="sglang:
|
285
|
+
name="sglang:num_transfer_failed_reqs_total",
|
269
286
|
documentation="The number of transfer failed requests.",
|
270
287
|
labelnames=labels.keys(),
|
271
288
|
)
|
289
|
+
self.kv_transfer_speed_gb_s = Gauge(
|
290
|
+
name="sglang:kv_transfer_speed_gb_s",
|
291
|
+
documentation="The transfer speed of the KV cache in GB/s.",
|
292
|
+
labelnames=labels.keys(),
|
293
|
+
multiprocess_mode="mostrecent",
|
294
|
+
)
|
295
|
+
self.kv_transfer_latency_ms = Gauge(
|
296
|
+
name="sglang:kv_transfer_latency_ms",
|
297
|
+
documentation="The transfer latency of the KV cache in ms.",
|
298
|
+
labelnames=labels.keys(),
|
299
|
+
multiprocess_mode="mostrecent",
|
300
|
+
)
|
301
|
+
|
302
|
+
# Retract
|
303
|
+
self.total_retracted_reqs = Gauge(
|
304
|
+
name="sglang:total_retracted_reqs",
|
305
|
+
documentation="The total number of retracted requests due to kvcache full.",
|
306
|
+
labelnames=labels.keys(),
|
307
|
+
multiprocess_mode="mostrecent",
|
308
|
+
)
|
309
|
+
self.num_retracted_reqs = Gauge(
|
310
|
+
name="sglang:num_retracted_reqs",
|
311
|
+
documentation="The number of retracted requests.",
|
312
|
+
labelnames=labels.keys(),
|
313
|
+
)
|
314
|
+
self.num_paused_reqs = Gauge(
|
315
|
+
name="sglang:num_paused_reqs",
|
316
|
+
documentation="The number of paused requests by async weight sync.",
|
317
|
+
labelnames=labels.keys(),
|
318
|
+
)
|
319
|
+
|
320
|
+
# Utilization
|
321
|
+
self.utilization = Gauge(
|
322
|
+
name="sglang:utilization",
|
323
|
+
documentation="The utilization.",
|
324
|
+
labelnames=labels.keys(),
|
325
|
+
multiprocess_mode="mostrecent",
|
326
|
+
)
|
327
|
+
self.max_running_requests_under_SLO = Gauge(
|
328
|
+
name="sglang:max_running_requests_under_SLO",
|
329
|
+
documentation="The maximum number of running requests under SLO.",
|
330
|
+
labelnames=labels.keys(),
|
331
|
+
multiprocess_mode="mostrecent",
|
332
|
+
)
|
333
|
+
|
334
|
+
# Engine startup
|
335
|
+
self.engine_startup_time = Gauge(
|
336
|
+
name="sglang:engine_startup_time",
|
337
|
+
documentation="The time taken for the engine to start up.",
|
338
|
+
labelnames=labels.keys(),
|
339
|
+
multiprocess_mode="mostrecent",
|
340
|
+
)
|
341
|
+
self.engine_load_weights_time = Gauge(
|
342
|
+
name="sglang:engine_load_weights_time",
|
343
|
+
documentation="The time taken for the engine to load weights.",
|
344
|
+
labelnames=labels.keys(),
|
345
|
+
multiprocess_mode="mostrecent",
|
346
|
+
)
|
347
|
+
|
348
|
+
# Additional queueing time histogram
|
349
|
+
self.queue_time = Histogram(
|
350
|
+
name="sglang:queue_time_s",
|
351
|
+
documentation="Histogram of queueing time in seconds.",
|
352
|
+
labelnames=labels.keys(),
|
353
|
+
buckets=[
|
354
|
+
0.0,
|
355
|
+
0.1,
|
356
|
+
0.2,
|
357
|
+
0.5,
|
358
|
+
1,
|
359
|
+
2,
|
360
|
+
3,
|
361
|
+
4,
|
362
|
+
5,
|
363
|
+
10,
|
364
|
+
15,
|
365
|
+
20,
|
366
|
+
30,
|
367
|
+
40,
|
368
|
+
50,
|
369
|
+
60,
|
370
|
+
70,
|
371
|
+
80,
|
372
|
+
90,
|
373
|
+
100,
|
374
|
+
200,
|
375
|
+
300,
|
376
|
+
400,
|
377
|
+
500,
|
378
|
+
600,
|
379
|
+
700,
|
380
|
+
800,
|
381
|
+
900,
|
382
|
+
1000,
|
383
|
+
1200,
|
384
|
+
1400,
|
385
|
+
1600,
|
386
|
+
1800,
|
387
|
+
2000,
|
388
|
+
2500,
|
389
|
+
3000,
|
390
|
+
],
|
391
|
+
)
|
392
|
+
|
393
|
+
# Grammar metrics
|
394
|
+
self.grammar_compilation_time = Histogram(
|
395
|
+
name="sglang:grammar_compilation_time_seconds",
|
396
|
+
documentation="Histogram of grammar compilation time in seconds.",
|
397
|
+
labelnames=labels.keys(),
|
398
|
+
buckets=[
|
399
|
+
0.0,
|
400
|
+
0.01,
|
401
|
+
0.02,
|
402
|
+
0.05,
|
403
|
+
0.1,
|
404
|
+
0.2,
|
405
|
+
0.5,
|
406
|
+
1,
|
407
|
+
2,
|
408
|
+
5,
|
409
|
+
10,
|
410
|
+
20,
|
411
|
+
30,
|
412
|
+
60,
|
413
|
+
90,
|
414
|
+
120,
|
415
|
+
240,
|
416
|
+
],
|
417
|
+
)
|
418
|
+
self.num_grammar_cache_hit = Counter(
|
419
|
+
name="sglang:num_grammar_cache_hit_total",
|
420
|
+
documentation="Number of grammar cache hits.",
|
421
|
+
labelnames=labels.keys(),
|
422
|
+
)
|
423
|
+
self.num_grammar_aborted = Counter(
|
424
|
+
name="sglang:num_grammar_aborted_total",
|
425
|
+
documentation="Number of grammar aborted requests.",
|
426
|
+
labelnames=labels.keys(),
|
427
|
+
)
|
428
|
+
self.num_grammar_total = Counter(
|
429
|
+
name="sglang:num_grammar_total",
|
430
|
+
documentation="Number of the total grammar requests.",
|
431
|
+
labelnames=labels.keys(),
|
432
|
+
)
|
433
|
+
self.grammar_schema_count = Histogram(
|
434
|
+
name="sglang:grammar_schema_count",
|
435
|
+
documentation="Histogram of grammar schema count.",
|
436
|
+
labelnames=labels.keys(),
|
437
|
+
buckets=[
|
438
|
+
0,
|
439
|
+
1,
|
440
|
+
2,
|
441
|
+
5,
|
442
|
+
10,
|
443
|
+
20,
|
444
|
+
30,
|
445
|
+
40,
|
446
|
+
60,
|
447
|
+
80,
|
448
|
+
100,
|
449
|
+
120,
|
450
|
+
140,
|
451
|
+
160,
|
452
|
+
180,
|
453
|
+
200,
|
454
|
+
300,
|
455
|
+
400,
|
456
|
+
500,
|
457
|
+
700,
|
458
|
+
1000,
|
459
|
+
],
|
460
|
+
)
|
461
|
+
self.grammar_ebnf_size = Histogram(
|
462
|
+
name="sglang:grammar_ebnf_size",
|
463
|
+
documentation="Histogram of grammar EBNF size.",
|
464
|
+
labelnames=labels.keys(),
|
465
|
+
buckets=[
|
466
|
+
0,
|
467
|
+
50,
|
468
|
+
100,
|
469
|
+
200,
|
470
|
+
300,
|
471
|
+
500,
|
472
|
+
1000,
|
473
|
+
2000,
|
474
|
+
3000,
|
475
|
+
5000,
|
476
|
+
10000,
|
477
|
+
20000,
|
478
|
+
30000,
|
479
|
+
50000,
|
480
|
+
100000,
|
481
|
+
],
|
482
|
+
)
|
483
|
+
|
484
|
+
tree_traversal_time_buckets = [
|
485
|
+
0.0,
|
486
|
+
0.01,
|
487
|
+
0.02,
|
488
|
+
0.05,
|
489
|
+
0.1,
|
490
|
+
0.2,
|
491
|
+
0.5,
|
492
|
+
1,
|
493
|
+
2,
|
494
|
+
5,
|
495
|
+
10,
|
496
|
+
15,
|
497
|
+
30,
|
498
|
+
60,
|
499
|
+
90,
|
500
|
+
120,
|
501
|
+
240,
|
502
|
+
]
|
503
|
+
self.grammar_tree_traversal_time_avg = Histogram(
|
504
|
+
name="sglang:grammar_tree_traversal_time_avg",
|
505
|
+
documentation="Histogram of average grammar tree traversal time in seconds.",
|
506
|
+
labelnames=labels.keys(),
|
507
|
+
buckets=tree_traversal_time_buckets,
|
508
|
+
)
|
509
|
+
self.grammar_tree_traversal_time_max = Histogram(
|
510
|
+
name="sglang:grammar_tree_traversal_time_max",
|
511
|
+
documentation="Histogram of max grammar tree traversal time in seconds.",
|
512
|
+
labelnames=labels.keys(),
|
513
|
+
buckets=tree_traversal_time_buckets,
|
514
|
+
)
|
272
515
|
|
273
516
|
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
|
274
517
|
# Convenience function for logging to gauge.
|
275
518
|
gauge.labels(**self.labels).set(data)
|
276
519
|
|
520
|
+
def log_histogram(self, histogram, data: Union[int, float]) -> None:
|
521
|
+
histogram.labels(**self.labels).observe(data)
|
522
|
+
|
277
523
|
def increment_bootstrap_failed_reqs(self) -> None:
|
278
524
|
self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
|
279
525
|
|
@@ -284,14 +530,20 @@ class SchedulerMetricsCollector:
|
|
284
530
|
self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
|
285
531
|
self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
|
286
532
|
self._log_gauge(self.token_usage, stats.token_usage)
|
533
|
+
self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
|
287
534
|
self._log_gauge(self.gen_throughput, stats.gen_throughput)
|
288
535
|
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
|
289
536
|
self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
|
537
|
+
self._log_gauge(
|
538
|
+
self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
|
539
|
+
)
|
290
540
|
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
541
|
+
self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
|
542
|
+
|
543
|
+
# Speculative decoding
|
291
544
|
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
292
|
-
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
293
545
|
|
294
|
-
#
|
546
|
+
# PD disaggregation
|
295
547
|
self._log_gauge(
|
296
548
|
self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
|
297
549
|
)
|
@@ -304,15 +556,59 @@ class SchedulerMetricsCollector:
|
|
304
556
|
self._log_gauge(
|
305
557
|
self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
|
306
558
|
)
|
559
|
+
self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
|
560
|
+
self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
|
561
|
+
|
562
|
+
# Retract
|
563
|
+
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
564
|
+
self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
|
565
|
+
self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
|
566
|
+
|
567
|
+
# Utilization
|
568
|
+
self._log_gauge(self.utilization, stats.utilization)
|
569
|
+
if stats.max_running_requests_under_SLO is not None:
|
570
|
+
self._log_gauge(
|
571
|
+
self.max_running_requests_under_SLO,
|
572
|
+
stats.max_running_requests_under_SLO,
|
573
|
+
)
|
574
|
+
|
575
|
+
# Engine startup time
|
576
|
+
self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
|
577
|
+
if stats.engine_load_weights_time is not None:
|
578
|
+
self._log_gauge(
|
579
|
+
self.engine_load_weights_time, stats.engine_load_weights_time
|
580
|
+
)
|
307
581
|
|
308
582
|
self.last_log_time = time.perf_counter()
|
309
583
|
|
584
|
+
def log_grammar_stats(self, grammar_stats) -> None:
|
585
|
+
# Duck-typed GrammarStats to avoid cross-package dependency
|
586
|
+
if getattr(grammar_stats, "compilation_time", None) is not None:
|
587
|
+
self.log_histogram(
|
588
|
+
self.grammar_compilation_time, grammar_stats.compilation_time
|
589
|
+
)
|
590
|
+
if getattr(grammar_stats, "schema_count", None) is not None:
|
591
|
+
self.log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
|
592
|
+
if getattr(grammar_stats, "ebnf_size", None) is not None:
|
593
|
+
self.log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
|
594
|
+
tree_times = getattr(grammar_stats, "tree_traversal_time", None)
|
595
|
+
if tree_times:
|
596
|
+
max_time = max(tree_times)
|
597
|
+
avg_time = sum(tree_times) / len(tree_times)
|
598
|
+
self.log_histogram(self.grammar_tree_traversal_time_max, max_time)
|
599
|
+
self.log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
|
600
|
+
if getattr(grammar_stats, "is_cache_hit", False):
|
601
|
+
self.num_grammar_cache_hit.labels(**self.labels).inc(1)
|
602
|
+
if getattr(grammar_stats, "is_grammar_aborted", False):
|
603
|
+
self.num_grammar_aborted.labels(**self.labels).inc(1)
|
604
|
+
self.num_grammar_total.labels(**self.labels).inc(1)
|
605
|
+
|
310
606
|
|
311
607
|
class TokenizerMetricsCollector:
|
312
608
|
def __init__(
|
313
609
|
self,
|
314
|
-
server_args: ServerArgs,
|
315
|
-
labels: Dict[str, str],
|
610
|
+
server_args: Optional[ServerArgs] = None,
|
611
|
+
labels: Dict[str, str] = None,
|
316
612
|
bucket_time_to_first_token: Optional[List[float]] = None,
|
317
613
|
bucket_inter_token_latency: Optional[List[float]] = None,
|
318
614
|
bucket_e2e_request_latency: Optional[List[float]] = None,
|
@@ -321,7 +617,7 @@ class TokenizerMetricsCollector:
|
|
321
617
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
322
618
|
from prometheus_client import Counter, Histogram
|
323
619
|
|
324
|
-
self.labels = labels
|
620
|
+
self.labels = labels or {}
|
325
621
|
self.collect_tokens_histogram = collect_tokens_histogram
|
326
622
|
|
327
623
|
self.prompt_tokens_total = Counter(
|
@@ -361,6 +657,13 @@ class TokenizerMetricsCollector:
|
|
361
657
|
30000,
|
362
658
|
35000,
|
363
659
|
40000,
|
660
|
+
66000,
|
661
|
+
99000,
|
662
|
+
132000,
|
663
|
+
300000,
|
664
|
+
600000,
|
665
|
+
900000,
|
666
|
+
1100000,
|
364
667
|
]
|
365
668
|
self.prompt_tokens_histogram = Histogram(
|
366
669
|
name="sglang:prompt_tokens_histogram",
|
@@ -370,34 +673,13 @@ class TokenizerMetricsCollector:
|
|
370
673
|
server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
|
371
674
|
),
|
372
675
|
)
|
373
|
-
default_bucket_generation_tokens = [
|
374
|
-
100,
|
375
|
-
300,
|
376
|
-
500,
|
377
|
-
1000,
|
378
|
-
1200,
|
379
|
-
1500,
|
380
|
-
1700,
|
381
|
-
2000,
|
382
|
-
2500,
|
383
|
-
3000,
|
384
|
-
3500,
|
385
|
-
4000,
|
386
|
-
4500,
|
387
|
-
5000,
|
388
|
-
6000,
|
389
|
-
7000,
|
390
|
-
8000,
|
391
|
-
9000,
|
392
|
-
10000,
|
393
|
-
]
|
394
676
|
self.generation_tokens_histogram = Histogram(
|
395
677
|
name="sglang:generation_tokens_histogram",
|
396
678
|
documentation="Histogram of generation token length.",
|
397
679
|
labelnames=labels.keys(),
|
398
680
|
buckets=generate_buckets(
|
399
681
|
server_args.generation_tokens_buckets,
|
400
|
-
|
682
|
+
default_bucket_prompt_tokens,
|
401
683
|
),
|
402
684
|
)
|
403
685
|
|
@@ -467,7 +749,10 @@ class TokenizerMetricsCollector:
|
|
467
749
|
100,
|
468
750
|
200,
|
469
751
|
400,
|
470
|
-
|
752
|
+
600,
|
753
|
+
1200,
|
754
|
+
1800,
|
755
|
+
2400,
|
471
756
|
]
|
472
757
|
|
473
758
|
if bucket_inter_token_latency is None:
|
@@ -518,38 +803,68 @@ class TokenizerMetricsCollector:
|
|
518
803
|
buckets=bucket_e2e_request_latency,
|
519
804
|
)
|
520
805
|
|
521
|
-
|
522
|
-
|
806
|
+
# Offline batch specific TTFT histogram
|
807
|
+
self.histogram_time_to_first_token_offline_batch = Histogram(
|
808
|
+
name="sglang:time_to_first_token_seconds_offline_batch",
|
809
|
+
documentation="Histogram of time to first token in seconds for offline batch requests.",
|
810
|
+
labelnames=labels.keys(),
|
811
|
+
buckets=bucket_time_to_first_token,
|
812
|
+
)
|
523
813
|
|
524
814
|
def observe_one_finished_request(
    self,
    labels: Dict[str, str],
    prompt_tokens: int,
    generation_tokens: int,
    cached_tokens: int,
    e2e_latency: float,
    has_grammar: bool,
):
    """Record the counters and histograms for one finished request.

    Args:
        labels: Label values applied to every metric touched here (passed
            per call rather than taken from ``self.labels``).
        prompt_tokens: Added to the prompt-token counter.
        generation_tokens: Added to the generation-token counter.
        cached_tokens: Added to the cached-token counter only when positive.
        e2e_latency: Observed (as ``float``) into the end-to-end latency
            histogram.
        has_grammar: When true, the ``num_so_requests_total`` counter is
            bumped as well.

    The two token-length histograms are only updated when
    ``self.collect_tokens_histogram`` is truthy.
    """

    def add(counter, amount) -> None:
        counter.labels(**labels).inc(amount)

    add(self.prompt_tokens_total, prompt_tokens)
    add(self.generation_tokens_total, generation_tokens)
    if cached_tokens > 0:
        add(self.cached_tokens_total, cached_tokens)
    add(self.num_requests_total, 1)
    if has_grammar:
        add(self.num_so_requests_total, 1)

    latency_histogram = self.histogram_e2e_request_latency.labels(**labels)
    latency_histogram.observe(float(e2e_latency))

    if not self.collect_tokens_histogram:
        return

    prompt_histogram = self.prompt_tokens_histogram.labels(**labels)
    prompt_histogram.observe(float(prompt_tokens))
    generation_histogram = self.generation_tokens_histogram.labels(**labels)
    generation_histogram.observe(float(generation_tokens))
|
546
836
|
|
547
|
-
def
|
837
|
+
def observe_time_to_first_token(
    self, labels: Dict[str, str], value: float, type: str = ""
):
    """Observe one time-to-first-token sample.

    Args:
        labels: Label values for the histogram child being updated.
        value: The sample to observe.
        type: When equal to ``"batch"`` the sample goes to the offline-batch
            histogram; any other value (including the default ``""``) goes to
            the regular time-to-first-token histogram.
    """
    target = (
        self.histogram_time_to_first_token_offline_batch
        if type == "batch"
        else self.histogram_time_to_first_token
    )
    target.labels(**labels).observe(value)
|
846
|
+
|
847
|
+
def check_time_to_first_token_straggler(self, value: float) -> bool:
    """Tell whether ``value`` falls at or beyond the observed p99 bucket.

    The regular time-to-first-token histogram child for ``self.labels`` is
    inspected through the private ``_buckets`` / ``_upper_bounds`` attributes
    of the histogram child. The per-bucket counts are summed here, i.e. they
    are treated as non-cumulative.

    Args:
        value: The time-to-first-token sample to classify.

    Returns:
        ``False`` while fewer than 100 samples have been recorded. Otherwise,
        the first bucket whose running count exceeds 99% of all samples is
        located and the result is ``value >= <that bucket's upper bound>``.
        ``False`` is also returned if no bucket crosses the threshold.
    """
    child = self.histogram_time_to_first_token.labels(**self.labels)
    buckets = child._buckets

    observed = sum(entry._value for entry in buckets)
    # Too few samples to make a percentile estimate meaningful.
    if observed < 100:
        return False

    cutoff = observed * 0.99
    running = 0
    for index, entry in enumerate(buckets):
        running += entry._value
        if running > cutoff:
            return value >= child._upper_bounds[index]
    return False
|
859
|
+
|
860
|
+
def observe_inter_token_latency(
|
861
|
+
self, labels: Dict[str, str], internval: float, num_new_tokens: int
|
862
|
+
):
|
548
863
|
adjusted_interval = internval / num_new_tokens
|
549
864
|
|
550
865
|
# A faster version of the Histogram::observe which observes multiple values at the same time.
|
551
866
|
# reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
|
552
|
-
his = self.histogram_inter_token_latency_seconds.labels(**
|
867
|
+
his = self.histogram_inter_token_latency_seconds.labels(**labels)
|
553
868
|
his._sum.inc(internval)
|
554
869
|
|
555
870
|
for i, bound in enumerate(his._upper_bounds):
|
@@ -559,3 +874,105 @@ class TokenizerMetricsCollector:
|
|
559
874
|
|
560
875
|
def observe_one_aborted_request(self):
    """Add one to the aborted-requests counter under ``self.labels``."""
    aborted = self.num_aborted_requests_total.labels(**self.labels)
    aborted.inc(1)
|
877
|
+
|
878
|
+
|
879
|
+
@dataclass
class StorageMetrics:
    """Four lists of storage samples, each starting as its own empty list."""

    # Page-count samples for prefetch operations.
    prefetch_pgs: List[int] = field(default_factory=list)
    # Page-count samples for backup operations.
    backup_pgs: List[int] = field(default_factory=list)
    # Bandwidth samples for prefetch operations.
    prefetch_bandwidth: List[float] = field(default_factory=list)
    # Bandwidth samples for backup operations.
    backup_bandwidth: List[float] = field(default_factory=list)
|
885
|
+
|
886
|
+
|
887
|
+
class StorageMetricsCollector:
    """Prometheus counters and histograms for prefetch/backup storage activity.

    All metrics are declared with the keys of ``labels`` as label names and
    are always updated under those same ``labels``.
    """

    def __init__(
        self,
        labels: Dict[str, str],
    ):
        """Create the two token counters and four histograms.

        Args:
            labels: Label name -> value mapping used for every metric.
        """
        # Imported locally, as the other collectors in this file do.
        from prometheus_client import Counter, Histogram

        self.labels = labels
        label_names = labels.keys()

        self.prefetched_tokens_total = Counter(
            name="sglang:prefetched_tokens_total",
            documentation="Number of prefetched prompt tokens.",
            labelnames=label_names,
        )
        self.backuped_tokens_total = Counter(
            name="sglang:backuped_tokens_total",
            documentation="Number of backuped tokens.",
            labelnames=label_names,
        )

        # Bucket upper bounds shared by the two page-count histograms.
        page_buckets = [1, 5, 10, 50, 100]
        # Bucket upper bounds shared by the two bandwidth histograms.
        bandwidth_buckets = [0.1, 0.5, 1, 5, 10, 50, 100]

        self.histogram_prefetch_pgs = Histogram(
            name="sglang:prefetch_pgs",
            documentation="Histogram of prefetch pages of batches.",
            labelnames=label_names,
            buckets=page_buckets,
        )
        self.histogram_backup_pgs = Histogram(
            name="sglang:backup_pgs",
            documentation="Histogram of backup pages of batches.",
            labelnames=label_names,
            buckets=page_buckets,
        )
        self.histogram_prefetch_bandwidth = Histogram(
            name="sglang:prefetch_bandwidth",
            documentation="Histogram of prefetch bandwidth in GB/s.",
            labelnames=label_names,
            buckets=bandwidth_buckets,
        )
        self.histogram_backup_bandwidth = Histogram(
            name="sglang:backup_bandwidth",
            documentation="Histogram of backup bandwidth in GB/s.",
            labelnames=label_names,
            buckets=bandwidth_buckets,
        )

    def log_prefetched_tokens(self, prefetched_tokens: int):
        """Add ``prefetched_tokens`` to the prefetch counter when it is positive."""
        if prefetched_tokens > 0:
            counter = self.prefetched_tokens_total.labels(**self.labels)
            counter.inc(prefetched_tokens)

    def log_backuped_tokens(self, backuped_tokens: int):
        """Add ``backuped_tokens`` to the backup counter when it is positive."""
        if backuped_tokens > 0:
            counter = self.backuped_tokens_total.labels(**self.labels)
            counter.inc(backuped_tokens)

    def _log_histogram(self, histogram, data: Union[int, float]):
        """Observe a single sample into ``histogram`` under ``self.labels``."""
        child = histogram.labels(**self.labels)
        child.observe(data)

    def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
        """Observe every sample held by ``storage_metrics``.

        ``None`` is a no-op. Any other argument must be a ``StorageMetrics``
        instance (checked with ``assert``). Samples are observed list by list
        in the order: prefetch pages, backup pages, prefetch bandwidth, backup
        bandwidth.
        """
        if storage_metrics is None:
            return

        assert isinstance(storage_metrics, StorageMetrics)

        for sample in storage_metrics.prefetch_pgs:
            self._log_histogram(self.histogram_prefetch_pgs, sample)
        for sample in storage_metrics.backup_pgs:
            self._log_histogram(self.histogram_backup_pgs, sample)
        for sample in storage_metrics.prefetch_bandwidth:
            self._log_histogram(self.histogram_prefetch_bandwidth, sample)
        for sample in storage_metrics.backup_bandwidth:
            self._log_histogram(self.histogram_backup_bandwidth, sample)
|