sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +89 -54
- sglang/bench_serving.py +437 -40
- sglang/lang/interpreter.py +1 -1
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +90 -27
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +82 -26
- sglang/srt/entrypoints/openai/serving_completions.py +25 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +28 -7
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +381 -136
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +11 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -8
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +111 -56
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
- sglang/srt/layers/quantization/fp8.py +78 -48
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +45 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +93 -68
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +396 -365
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +18 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +190 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +148 -122
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +77 -480
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +53 -40
- sglang/srt/mem_cache/hiradix_cache.py +196 -104
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +395 -53
- sglang/srt/mem_cache/memory_pool_host.py +27 -19
- sglang/srt/mem_cache/radix_cache.py +6 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +190 -32
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +323 -53
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +7 -19
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +91 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{conversation.py → parser/conversation.py} +38 -5
- sglang/srt/parser/harmony_parser.py +588 -0
- sglang/srt/parser/reasoning_parser.py +309 -0
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +307 -80
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +96 -7
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- sglang/srt/reasoning_parser.py +0 -553
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -14,10 +14,12 @@
|
|
14
14
|
"""Utilities for Prometheus Metrics Collection."""
|
15
15
|
|
16
16
|
import time
|
17
|
-
from dataclasses import dataclass
|
17
|
+
from dataclasses import dataclass, field
|
18
18
|
from enum import Enum
|
19
19
|
from typing import Dict, List, Optional, Union
|
20
20
|
|
21
|
+
from sglang.srt.metrics.utils import generate_buckets
|
22
|
+
from sglang.srt.server_args import ServerArgs
|
21
23
|
from sglang.srt.utils import get_bool_env_var
|
22
24
|
|
23
25
|
SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
|
@@ -48,6 +50,9 @@ class TimeStats:
|
|
48
50
|
DECODE = "decode"
|
49
51
|
INVALID = "invalid"
|
50
52
|
|
53
|
+
def get_queueing_time(self) -> float:
|
54
|
+
return self.forward_entry_time - self.wait_queue_entry_time
|
55
|
+
|
51
56
|
def __str__(self) -> str:
|
52
57
|
# if unified
|
53
58
|
_type = self.get_type()
|
@@ -132,27 +137,48 @@ class TimeStats:
|
|
132
137
|
|
133
138
|
@dataclass
|
134
139
|
class SchedulerStats:
|
140
|
+
# Basics
|
135
141
|
num_running_reqs: int = 0
|
136
142
|
num_used_tokens: int = 0
|
137
143
|
token_usage: float = 0.0
|
144
|
+
swa_token_usage: float = 0.0
|
138
145
|
gen_throughput: float = 0.0
|
139
146
|
num_queue_reqs: int = 0
|
140
|
-
cache_hit_rate: float = 0.0
|
141
147
|
num_grammar_queue_reqs: int = 0
|
142
|
-
|
148
|
+
num_running_reqs_offline_batch: int = 0
|
143
149
|
avg_request_queue_latency: float = 0.0
|
150
|
+
cache_hit_rate: float = 0.0
|
151
|
+
|
152
|
+
# Speculative decoding
|
153
|
+
spec_accept_length: float = 0.0
|
154
|
+
|
155
|
+
# PD disaggregation
|
144
156
|
num_prefill_prealloc_queue_reqs: int = 0
|
145
157
|
num_prefill_inflight_queue_reqs: int = 0
|
146
158
|
num_decode_prealloc_queue_reqs: int = 0
|
147
159
|
num_decode_transfer_queue_reqs: int = 0
|
160
|
+
kv_transfer_speed_gb_s: float = 0.0
|
161
|
+
kv_transfer_latency_ms: float = 0.0
|
162
|
+
|
163
|
+
# Retract
|
148
164
|
total_retracted_reqs: int = 0
|
165
|
+
num_retracted_reqs: int = 0
|
166
|
+
num_paused_reqs: int = 0
|
167
|
+
|
168
|
+
# Utilization
|
169
|
+
utilization: float = 0.0
|
170
|
+
max_running_requests_under_SLO: Optional[int] = None
|
171
|
+
|
172
|
+
# Engine startup
|
173
|
+
engine_startup_time: float = 0.0
|
174
|
+
engine_load_weights_time: float = 0.0
|
149
175
|
|
150
176
|
|
151
177
|
class SchedulerMetricsCollector:
|
152
178
|
|
153
179
|
def __init__(self, labels: Dict[str, str]) -> None:
|
154
180
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
155
|
-
from prometheus_client import Counter, Gauge
|
181
|
+
from prometheus_client import Counter, Gauge, Histogram
|
156
182
|
|
157
183
|
self.labels = labels
|
158
184
|
self.last_log_time = time.perf_counter()
|
@@ -163,115 +189,338 @@ class SchedulerMetricsCollector:
|
|
163
189
|
labelnames=labels.keys(),
|
164
190
|
multiprocess_mode="mostrecent",
|
165
191
|
)
|
166
|
-
|
167
192
|
self.num_used_tokens = Gauge(
|
168
193
|
name="sglang:num_used_tokens",
|
169
194
|
documentation="The number of used tokens.",
|
170
195
|
labelnames=labels.keys(),
|
171
196
|
multiprocess_mode="mostrecent",
|
172
197
|
)
|
173
|
-
|
174
198
|
self.token_usage = Gauge(
|
175
199
|
name="sglang:token_usage",
|
176
200
|
documentation="The token usage.",
|
177
201
|
labelnames=labels.keys(),
|
178
202
|
multiprocess_mode="mostrecent",
|
179
203
|
)
|
180
|
-
|
204
|
+
self.swa_token_usage = Gauge(
|
205
|
+
name="sglang:swa_token_usage",
|
206
|
+
documentation="The token usage for SWA layers.",
|
207
|
+
labelnames=labels.keys(),
|
208
|
+
multiprocess_mode="mostrecent",
|
209
|
+
)
|
181
210
|
self.gen_throughput = Gauge(
|
182
211
|
name="sglang:gen_throughput",
|
183
212
|
documentation="The generation throughput (token/s).",
|
184
213
|
labelnames=labels.keys(),
|
185
214
|
multiprocess_mode="mostrecent",
|
186
215
|
)
|
187
|
-
|
188
216
|
self.num_queue_reqs = Gauge(
|
189
217
|
name="sglang:num_queue_reqs",
|
190
218
|
documentation="The number of requests in the waiting queue.",
|
191
219
|
labelnames=labels.keys(),
|
192
220
|
multiprocess_mode="mostrecent",
|
193
221
|
)
|
194
|
-
|
195
222
|
self.num_grammar_queue_reqs = Gauge(
|
196
223
|
name="sglang:num_grammar_queue_reqs",
|
197
224
|
documentation="The number of requests in the grammar waiting queue.",
|
198
225
|
labelnames=labels.keys(),
|
199
226
|
multiprocess_mode="mostrecent",
|
200
227
|
)
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
documentation="The prefix cache hit rate.",
|
205
|
-
labelnames=labels.keys(),
|
206
|
-
multiprocess_mode="mostrecent",
|
207
|
-
)
|
208
|
-
|
209
|
-
self.spec_accept_length = Gauge(
|
210
|
-
name="sglang:spec_accept_length",
|
211
|
-
documentation="The average acceptance length of speculative decoding.",
|
228
|
+
self.num_running_reqs_offline_batch = Gauge(
|
229
|
+
name="sglang:num_running_reqs_offline_batch",
|
230
|
+
documentation="The number of running low-priority offline batch requests(label is 'batch').",
|
212
231
|
labelnames=labels.keys(),
|
213
232
|
multiprocess_mode="mostrecent",
|
214
233
|
)
|
215
|
-
|
216
234
|
self.avg_request_queue_latency = Gauge(
|
217
235
|
name="sglang:avg_request_queue_latency",
|
218
236
|
documentation="The average request queue latency for the last batch of requests in seconds.",
|
219
237
|
labelnames=labels.keys(),
|
220
238
|
multiprocess_mode="mostrecent",
|
221
239
|
)
|
240
|
+
self.cache_hit_rate = Gauge(
|
241
|
+
name="sglang:cache_hit_rate",
|
242
|
+
documentation="The prefix cache hit rate.",
|
243
|
+
labelnames=labels.keys(),
|
244
|
+
multiprocess_mode="mostrecent",
|
245
|
+
)
|
222
246
|
|
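223
|
-
self.total_retracted_reqs = Gauge(
|
224
|
-
name="sglang:total_retracted_reqs",
|
225
|
-
documentation="The total number of retracted requests due to kvcache full.",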
|
247
|
+
# Speculative decoding
|
248
|
+
self.spec_accept_length = Gauge(
|
249
|
+
name="sglang:spec_accept_length",
|
250
|
+
documentation="The average acceptance length of speculative decoding.",
|
226
251
|
labelnames=labels.keys(),
|
227
252
|
multiprocess_mode="mostrecent",
|
228
253
|
)
|
229
254
|
|
230
|
-
#
|
255
|
+
# PD disaggregation
|
231
256
|
self.num_prefill_prealloc_queue_reqs = Gauge(
|
232
257
|
name="sglang:num_prefill_prealloc_queue_reqs",
|
233
258
|
documentation="The number of requests in the prefill prealloc queue.",
|
234
259
|
labelnames=labels.keys(),
|
235
260
|
multiprocess_mode="mostrecent",
|
236
261
|
)
|
237
|
-
|
238
262
|
self.num_prefill_inflight_queue_reqs = Gauge(
|
239
263
|
name="sglang:num_prefill_inflight_queue_reqs",
|
240
264
|
documentation="The number of requests in the prefill inflight queue.",
|
241
265
|
labelnames=labels.keys(),
|
242
266
|
multiprocess_mode="mostrecent",
|
243
267
|
)
|
244
|
-
|
245
268
|
self.num_decode_prealloc_queue_reqs = Gauge(
|
246
269
|
name="sglang:num_decode_prealloc_queue_reqs",
|
247
270
|
documentation="The number of requests in the decode prealloc queue.",
|
248
271
|
labelnames=labels.keys(),
|
249
272
|
multiprocess_mode="mostrecent",
|
250
273
|
)
|
251
|
-
|
252
274
|
self.num_decode_transfer_queue_reqs = Gauge(
|
253
275
|
name="sglang:num_decode_transfer_queue_reqs",
|
254
276
|
documentation="The number of requests in the decode transfer queue.",
|
255
277
|
labelnames=labels.keys(),
|
256
278
|
multiprocess_mode="mostrecent",
|
257
279
|
)
|
258
|
-
|
259
280
|
self.num_bootstrap_failed_reqs = Counter(
|
260
|
-
name="sglang:num_bootstrap_failed_reqs",
|
281
|
+
name="sglang:num_bootstrap_failed_reqs_total",
|
261
282
|
documentation="The number of bootstrap failed requests.",
|
262
283
|
labelnames=labels.keys(),
|
263
284
|
)
|
264
|
-
|
265
285
|
self.num_transfer_failed_reqs = Counter(
|
266
|
-
name="sglang:num_transfer_failed_reqs",
|
286
|
+
name="sglang:num_transfer_failed_reqs_total",
|
267
287
|
documentation="The number of transfer failed requests.",
|
268
288
|
labelnames=labels.keys(),
|
269
289
|
)
|
290
|
+
self.kv_transfer_speed_gb_s = Gauge(
|
291
|
+
name="sglang:kv_transfer_speed_gb_s",
|
292
|
+
documentation="The transfer speed of the KV cache in GB/s.",
|
293
|
+
labelnames=labels.keys(),
|
294
|
+
multiprocess_mode="mostrecent",
|
295
|
+
)
|
296
|
+
self.kv_transfer_latency_ms = Gauge(
|
297
|
+
name="sglang:kv_transfer_latency_ms",
|
298
|
+
documentation="The transfer latency of the KV cache in ms.",
|
299
|
+
labelnames=labels.keys(),
|
300
|
+
multiprocess_mode="mostrecent",
|
301
|
+
)
|
302
|
+
|
303
|
+
# Retract
|
304
|
+
self.total_retracted_reqs = Gauge(
|
305
|
+
name="sglang:total_retracted_reqs",
|
306
|
+
documentation="The total number of retracted requests due to kvcache full.",
|
307
|
+
labelnames=labels.keys(),
|
308
|
+
multiprocess_mode="mostrecent",
|
309
|
+
)
|
310
|
+
self.num_retracted_reqs = Gauge(
|
311
|
+
name="sglang:num_retracted_reqs",
|
312
|
+
documentation="The number of retracted requests.",
|
313
|
+
labelnames=labels.keys(),
|
314
|
+
)
|
315
|
+
self.num_paused_reqs = Gauge(
|
316
|
+
name="sglang:num_paused_reqs",
|
317
|
+
documentation="The number of paused requests by async weight sync.",
|
318
|
+
labelnames=labels.keys(),
|
319
|
+
)
|
320
|
+
|
321
|
+
# Utilization
|
322
|
+
self.utilization = Gauge(
|
323
|
+
name="sglang:utilization",
|
324
|
+
documentation="The utilization.",
|
325
|
+
labelnames=labels.keys(),
|
326
|
+
multiprocess_mode="mostrecent",
|
327
|
+
)
|
328
|
+
self.max_running_requests_under_SLO = Gauge(
|
329
|
+
name="sglang:max_running_requests_under_SLO",
|
330
|
+
documentation="The maximum number of running requests under SLO.",
|
331
|
+
labelnames=labels.keys(),
|
332
|
+
multiprocess_mode="mostrecent",
|
333
|
+
)
|
334
|
+
|
335
|
+
# Engine startup
|
336
|
+
self.engine_startup_time = Gauge(
|
337
|
+
name="sglang:engine_startup_time",
|
338
|
+
documentation="The time taken for the engine to start up.",
|
339
|
+
labelnames=labels.keys(),
|
340
|
+
multiprocess_mode="mostrecent",
|
341
|
+
)
|
342
|
+
self.engine_load_weights_time = Gauge(
|
343
|
+
name="sglang:engine_load_weights_time",
|
344
|
+
documentation="The time taken for the engine to load weights.",
|
345
|
+
labelnames=labels.keys(),
|
346
|
+
multiprocess_mode="mostrecent",
|
347
|
+
)
|
348
|
+
|
349
|
+
# Additional queueing time histogram
|
350
|
+
self.queue_time = Histogram(
|
351
|
+
name="sglang:queue_time_s",
|
352
|
+
documentation="Histogram of queueing time in seconds.",
|
353
|
+
labelnames=labels.keys(),
|
354
|
+
buckets=[
|
355
|
+
0.0,
|
356
|
+
0.1,
|
357
|
+
0.2,
|
358
|
+
0.5,
|
359
|
+
1,
|
360
|
+
2,
|
361
|
+
3,
|
362
|
+
4,
|
363
|
+
5,
|
364
|
+
10,
|
365
|
+
15,
|
366
|
+
20,
|
367
|
+
30,
|
368
|
+
40,
|
369
|
+
50,
|
370
|
+
60,
|
371
|
+
70,
|
372
|
+
80,
|
373
|
+
90,
|
374
|
+
100,
|
375
|
+
200,
|
376
|
+
300,
|
377
|
+
400,
|
378
|
+
500,
|
379
|
+
600,
|
380
|
+
700,
|
381
|
+
800,
|
382
|
+
900,
|
383
|
+
1000,
|
384
|
+
1200,
|
385
|
+
1400,
|
386
|
+
1600,
|
387
|
+
1800,
|
388
|
+
2000,
|
389
|
+
2500,
|
390
|
+
3000,
|
391
|
+
],
|
392
|
+
)
|
393
|
+
|
394
|
+
# Grammar metrics
|
395
|
+
self.grammar_compilation_time = Histogram(
|
396
|
+
name="sglang:grammar_compilation_time_seconds",
|
397
|
+
documentation="Histogram of grammar compilation time in seconds.",
|
398
|
+
labelnames=labels.keys(),
|
399
|
+
buckets=[
|
400
|
+
0.0,
|
401
|
+
0.01,
|
402
|
+
0.02,
|
403
|
+
0.05,
|
404
|
+
0.1,
|
405
|
+
0.2,
|
406
|
+
0.5,
|
407
|
+
1,
|
408
|
+
2,
|
409
|
+
5,
|
410
|
+
10,
|
411
|
+
20,
|
412
|
+
30,
|
413
|
+
60,
|
414
|
+
90,
|
415
|
+
120,
|
416
|
+
240,
|
417
|
+
],
|
418
|
+
)
|
419
|
+
self.num_grammar_cache_hit = Counter(
|
420
|
+
name="sglang:num_grammar_cache_hit_total",
|
421
|
+
documentation="Number of grammar cache hits.",
|
422
|
+
labelnames=labels.keys(),
|
423
|
+
)
|
424
|
+
self.num_grammar_aborted = Counter(
|
425
|
+
name="sglang:num_grammar_aborted_total",
|
426
|
+
documentation="Number of grammar aborted requests.",
|
427
|
+
labelnames=labels.keys(),
|
428
|
+
)
|
429
|
+
self.num_grammar_total = Counter(
|
430
|
+
name="sglang:num_grammar_total",
|
431
|
+
documentation="Number of the total grammar requests.",
|
432
|
+
labelnames=labels.keys(),
|
433
|
+
)
|
434
|
+
self.grammar_schema_count = Histogram(
|
435
|
+
name="sglang:grammar_schema_count",
|
436
|
+
documentation="Histogram of grammar schema count.",
|
437
|
+
labelnames=labels.keys(),
|
438
|
+
buckets=[
|
439
|
+
0,
|
440
|
+
1,
|
441
|
+
2,
|
442
|
+
5,
|
443
|
+
10,
|
444
|
+
20,
|
445
|
+
30,
|
446
|
+
40,
|
447
|
+
60,
|
448
|
+
80,
|
449
|
+
100,
|
450
|
+
120,
|
451
|
+
140,
|
452
|
+
160,
|
453
|
+
180,
|
454
|
+
200,
|
455
|
+
300,
|
456
|
+
400,
|
457
|
+
500,
|
458
|
+
700,
|
459
|
+
1000,
|
460
|
+
],
|
461
|
+
)
|
462
|
+
self.grammar_ebnf_size = Histogram(
|
463
|
+
name="sglang:grammar_ebnf_size",
|
464
|
+
documentation="Histogram of grammar EBNF size.",
|
465
|
+
labelnames=labels.keys(),
|
466
|
+
buckets=[
|
467
|
+
0,
|
468
|
+
50,
|
469
|
+
100,
|
470
|
+
200,
|
471
|
+
300,
|
472
|
+
500,
|
473
|
+
1000,
|
474
|
+
2000,
|
475
|
+
3000,
|
476
|
+
5000,
|
477
|
+
10000,
|
478
|
+
20000,
|
479
|
+
30000,
|
480
|
+
50000,
|
481
|
+
100000,
|
482
|
+
],
|
483
|
+
)
|
484
|
+
|
485
|
+
tree_traversal_time_buckets = [
|
486
|
+
0.0,
|
487
|
+
0.01,
|
488
|
+
0.02,
|
489
|
+
0.05,
|
490
|
+
0.1,
|
491
|
+
0.2,
|
492
|
+
0.5,
|
493
|
+
1,
|
494
|
+
2,
|
495
|
+
5,
|
496
|
+
10,
|
497
|
+
15,
|
498
|
+
30,
|
499
|
+
60,
|
500
|
+
90,
|
501
|
+
120,
|
502
|
+
240,
|
503
|
+
]
|
504
|
+
self.grammar_tree_traversal_time_avg = Histogram(
|
505
|
+
name="sglang:grammar_tree_traversal_time_avg",
|
506
|
+
documentation="Histogram of average grammar tree traversal time in seconds.",
|
507
|
+
labelnames=labels.keys(),
|
508
|
+
buckets=tree_traversal_time_buckets,
|
509
|
+
)
|
510
|
+
self.grammar_tree_traversal_time_max = Histogram(
|
511
|
+
name="sglang:grammar_tree_traversal_time_max",
|
512
|
+
documentation="Histogram of max grammar tree traversal time in seconds.",
|
513
|
+
labelnames=labels.keys(),
|
514
|
+
buckets=tree_traversal_time_buckets,
|
515
|
+
)
|
270
516
|
|
271
517
|
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
|
272
518
|
# Convenience function for logging to gauge.
|
273
519
|
gauge.labels(**self.labels).set(data)
|
274
520
|
|
521
|
+
def log_histogram(self, histogram, data: Union[int, float]) -> None:
|
522
|
+
histogram.labels(**self.labels).observe(data)
|
523
|
+
|
275
524
|
def increment_bootstrap_failed_reqs(self) -> None:
|
276
525
|
self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
|
277
526
|
|
@@ -282,14 +531,19 @@ class SchedulerMetricsCollector:
|
|
282
531
|
self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
|
283
532
|
self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
|
284
533
|
self._log_gauge(self.token_usage, stats.token_usage)
|
534
|
+
self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
|
285
535
|
self._log_gauge(self.gen_throughput, stats.gen_throughput)
|
286
536
|
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
|
287
537
|
self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
|
538
|
+
self._log_gauge(
|
539
|
+
self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
|
540
|
+
)
|
288
541
|
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
|
542
|
+
|
543
|
+
# Speculative decoding
|
289
544
|
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
|
290
|
-
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
291
545
|
|
292
|
-
#
|
546
|
+
# PD disaggregation
|
293
547
|
self._log_gauge(
|
294
548
|
self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
|
295
549
|
)
|
@@ -302,14 +556,59 @@ class SchedulerMetricsCollector:
|
|
302
556
|
self._log_gauge(
|
303
557
|
self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
|
304
558
|
)
|
559
|
+
self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
|
560
|
+
self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
|
561
|
+
|
562
|
+
# Retract
|
563
|
+
self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
|
564
|
+
self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
|
565
|
+
self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
|
566
|
+
|
567
|
+
# Utilization
|
568
|
+
self._log_gauge(self.utilization, stats.utilization)
|
569
|
+
if stats.max_running_requests_under_SLO is not None:
|
570
|
+
self._log_gauge(
|
571
|
+
self.max_running_requests_under_SLO,
|
572
|
+
stats.max_running_requests_under_SLO,
|
573
|
+
)
|
574
|
+
|
575
|
+
# Engine startup time
|
576
|
+
self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
|
577
|
+
if stats.engine_load_weights_time is not None:
|
578
|
+
self._log_gauge(
|
579
|
+
self.engine_load_weights_time, stats.engine_load_weights_time
|
580
|
+
)
|
305
581
|
|
306
582
|
self.last_log_time = time.perf_counter()
|
307
583
|
|
584
|
+
def log_grammar_stats(self, grammar_stats) -> None:
    """Record the metrics carried by one grammar-stats object.

    ``grammar_stats`` is duck-typed: every field is read with ``getattr``
    so this module does not have to import the class that defines it
    (avoids a cross-package dependency).

    * ``compilation_time``, ``schema_count`` and ``ebnf_size`` are each
      observed on their histogram when present and not ``None``.
    * A non-empty ``tree_traversal_time`` sequence contributes its maximum
      and its arithmetic mean to two separate histograms.
    * Truthy ``is_cache_hit`` / ``is_grammar_aborted`` flags increment
      their counters; the total counter is incremented on every call.
    """
    # (attribute on grammar_stats, histogram attribute on self).
    # The histogram is looked up lazily, only when there is a value to log.
    optional_scalars = (
        ("compilation_time", "grammar_compilation_time"),
        ("schema_count", "grammar_schema_count"),
        ("ebnf_size", "grammar_ebnf_size"),
    )
    for stat_name, histogram_name in optional_scalars:
        observed = getattr(grammar_stats, stat_name, None)
        if observed is not None:
            self.log_histogram(getattr(self, histogram_name), observed)

    # An empty or missing list of traversal times records nothing.
    traversal_times = getattr(grammar_stats, "tree_traversal_time", None)
    if traversal_times:
        slowest = max(traversal_times)
        mean = sum(traversal_times) / len(traversal_times)
        self.log_histogram(self.grammar_tree_traversal_time_max, slowest)
        self.log_histogram(self.grammar_tree_traversal_time_avg, mean)

    was_cache_hit = getattr(grammar_stats, "is_cache_hit", False)
    if was_cache_hit:
        self.num_grammar_cache_hit.labels(**self.labels).inc(1)

    was_aborted = getattr(grammar_stats, "is_grammar_aborted", False)
    if was_aborted:
        self.num_grammar_aborted.labels(**self.labels).inc(1)

    # Every call counts towards the total, whatever the outcome.
    self.num_grammar_total.labels(**self.labels).inc(1)
|
605
|
+
|
308
606
|
|
309
607
|
class TokenizerMetricsCollector:
|
310
608
|
def __init__(
|
311
609
|
self,
|
312
|
-
|
610
|
+
server_args: Optional[ServerArgs] = None,
|
611
|
+
labels: Dict[str, str] = None,
|
313
612
|
bucket_time_to_first_token: Optional[List[float]] = None,
|
314
613
|
bucket_inter_token_latency: Optional[List[float]] = None,
|
315
614
|
bucket_e2e_request_latency: Optional[List[float]] = None,
|
@@ -318,7 +617,7 @@ class TokenizerMetricsCollector:
|
|
318
617
|
# We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
|
319
618
|
from prometheus_client import Counter, Histogram
|
320
619
|
|
321
|
-
self.labels = labels
|
620
|
+
self.labels = labels or {}
|
322
621
|
self.collect_tokens_histogram = collect_tokens_histogram
|
323
622
|
|
324
623
|
self.prompt_tokens_total = Counter(
|
@@ -334,7 +633,7 @@ class TokenizerMetricsCollector:
|
|
334
633
|
)
|
335
634
|
|
336
635
|
if collect_tokens_histogram:
|
337
|
-
|
636
|
+
default_bucket_prompt_tokens = [
|
338
637
|
100,
|
339
638
|
300,
|
340
639
|
500,
|
@@ -358,39 +657,30 @@ class TokenizerMetricsCollector:
|
|
358
657
|
30000,
|
359
658
|
35000,
|
360
659
|
40000,
|
660
|
+
66000,
|
661
|
+
99000,
|
662
|
+
132000,
|
663
|
+
300000,
|
664
|
+
600000,
|
665
|
+
900000,
|
666
|
+
1100000,
|
361
667
|
]
|
362
668
|
self.prompt_tokens_histogram = Histogram(
|
363
669
|
name="sglang:prompt_tokens_histogram",
|
364
670
|
documentation="Histogram of prompt token length.",
|
365
671
|
labelnames=labels.keys(),
|
366
|
-
buckets=
|
672
|
+
buckets=generate_buckets(
|
673
|
+
server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
|
674
|
+
),
|
367
675
|
)
|
368
|
-
bucket_generation_tokens = [
|
369
|
-
100,
|
370
|
-
300,
|
371
|
-
500,
|
372
|
-
1000,
|
373
|
-
1200,
|
374
|
-
1500,
|
375
|
-
1700,
|
376
|
-
2000,
|
377
|
-
2500,
|
378
|
-
3000,
|
379
|
-
3500,
|
380
|
-
4000,
|
381
|
-
4500,
|
382
|
-
5000,
|
383
|
-
6000,
|
384
|
-
7000,
|
385
|
-
8000,
|
386
|
-
9000,
|
387
|
-
10000,
|
388
|
-
]
|
389
676
|
self.generation_tokens_histogram = Histogram(
|
390
677
|
name="sglang:generation_tokens_histogram",
|
391
678
|
documentation="Histogram of generation token length.",
|
392
679
|
labelnames=labels.keys(),
|
393
|
-
buckets=
|
680
|
+
buckets=generate_buckets(
|
681
|
+
server_args.generation_tokens_buckets,
|
682
|
+
default_bucket_prompt_tokens,
|
683
|
+
),
|
394
684
|
)
|
395
685
|
|
396
686
|
self.cached_tokens_total = Counter(
|
@@ -459,7 +749,10 @@ class TokenizerMetricsCollector:
|
|
459
749
|
100,
|
460
750
|
200,
|
461
751
|
400,
|
462
|
-
|
752
|
+
600,
|
753
|
+
1200,
|
754
|
+
1800,
|
755
|
+
2400,
|
463
756
|
]
|
464
757
|
|
465
758
|
if bucket_inter_token_latency is None:
|
@@ -510,6 +803,14 @@ class TokenizerMetricsCollector:
|
|
510
803
|
buckets=bucket_e2e_request_latency,
|
511
804
|
)
|
512
805
|
|
806
|
+
# Offline-batch-specific TTFT (time to first token) histogram
|
807
|
+
self.histogram_time_to_first_token_offline_batch = Histogram(
|
808
|
+
name="sglang:time_to_first_token_seconds_offline_batch",
|
809
|
+
documentation="Histogram of time to first token in seconds for offline batch requests.",
|
810
|
+
labelnames=labels.keys(),
|
811
|
+
buckets=bucket_time_to_first_token,
|
812
|
+
)
|
813
|
+
|
513
814
|
def _log_histogram(self, histogram, data: Union[int, float]) -> None:
|
514
815
|
histogram.labels(**self.labels).observe(data)
|
515
816
|
|
@@ -533,8 +834,26 @@ class TokenizerMetricsCollector:
|
|
533
834
|
self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
|
534
835
|
self._log_histogram(self.generation_tokens_histogram, generation_tokens)
|
535
836
|
|
536
|
-
def observe_time_to_first_token(self, value: float):
|
537
|
-
|
837
|
+
def observe_time_to_first_token(self, value: float, label: str = ""):
    """Record one time-to-first-token sample, in seconds.

    Samples whose ``label`` is exactly ``"batch"`` go to the offline-batch
    histogram; any other label (including the default empty string) goes
    to the regular time-to-first-token histogram.
    """
    if label == "batch":
        target = self.histogram_time_to_first_token_offline_batch
    else:
        target = self.histogram_time_to_first_token
    target.labels(**self.labels).observe(value)
|
844
|
+
|
845
|
+
def check_time_to_first_token_straggler(self, value: float) -> bool:
    """Tell whether ``value`` reaches the histogram's p99 bucket.

    The per-bucket counts of the regular time-to-first-token histogram are
    summed; with fewer than 100 observations the answer is always
    ``False``. Otherwise the buckets are scanned in order until the
    running count exceeds 99% of the total, and the result is whether
    ``value`` is at least that bucket's upper bound.

    NOTE: this reads the private ``_buckets`` / ``_upper_bounds`` /
    ``_value`` attributes of the labelled histogram child.
    """
    child = self.histogram_time_to_first_token.labels(**self.labels)
    buckets = child._buckets

    observed = sum(entry._value for entry in buckets)
    if observed < 100:
        # Too few samples to trust a percentile estimate.
        return False

    cutoff = observed * 0.99
    running = 0
    for position, entry in enumerate(buckets):
        running += entry._value
        if running <= cutoff:
            continue
        # First bucket whose cumulative count crosses the p99 cutoff.
        return value >= child._upper_bounds[position]
    return False
|
538
857
|
|
539
858
|
def observe_inter_token_latency(self, internval: float, num_new_tokens: int):
|
540
859
|
adjusted_interval = internval / num_new_tokens
|
@@ -551,3 +870,105 @@ class TokenizerMetricsCollector:
|
|
551
870
|
|
552
871
|
def observe_one_aborted_request(self):
    """Add one to the aborted-requests counter under this collector's labels."""
    aborted_counter = self.num_aborted_requests_total.labels(**self.labels)
    aborted_counter.inc(1)
|
873
|
+
|
874
|
+
|
875
|
+
@dataclass
class StorageMetrics:
    """Batches of storage samples waiting to be logged to histograms.

    Each field is an independent list that defaults to empty. The units
    below follow the documentation strings of the histograms that
    ``StorageMetricsCollector`` feeds these samples into.
    """

    # Page counts of prefetch operations.
    prefetch_pgs: List[int] = field(default_factory=list)
    # Page counts of backup operations.
    backup_pgs: List[int] = field(default_factory=list)
    # Prefetch bandwidth samples (GB/s).
    prefetch_bandwidth: List[float] = field(default_factory=list)
    # Backup bandwidth samples (GB/s).
    backup_bandwidth: List[float] = field(default_factory=list)
|
881
|
+
|
882
|
+
|
883
|
+
class StorageMetricsCollector:
    """Prometheus counters and histograms for storage prefetch/backup activity.

    Token totals are tracked with counters. Page counts and bandwidth
    samples are tracked with histograms. Every metric is registered with
    the label names of ``labels`` and is always updated under those same
    label values.
    """

    def __init__(
        self,
        labels: Dict[str, str],
    ):
        """Register all storage metrics.

        Args:
            labels: Label name -> label value mapping applied to every metric.
        """
        # prometheus_client is imported at call time, not at module import.
        from prometheus_client import Counter, Histogram

        self.labels = labels
        label_names = labels.keys()

        def make_histogram(name, documentation, buckets):
            # All four histograms differ only in name, help text and buckets.
            return Histogram(
                name=name,
                documentation=documentation,
                labelnames=label_names,
                buckets=buckets,
            )

        self.prefetched_tokens_total = Counter(
            name="sglang:prefetched_tokens_total",
            documentation="Number of prefetched prompt tokens.",
            labelnames=label_names,
        )
        self.backuped_tokens_total = Counter(
            name="sglang:backuped_tokens_total",
            documentation="Number of backuped tokens.",
            labelnames=label_names,
        )

        # Bucket upper bounds for page-count histograms.
        page_buckets = [1, 5, 10, 50, 100]
        # Bucket upper bounds for bandwidth histograms (GB/s).
        bandwidth_buckets = [0.1, 0.5, 1, 5, 10, 50, 100]

        self.histogram_prefetch_pgs = make_histogram(
            "sglang:prefetch_pgs",
            "Histogram of prefetch pages of batches.",
            page_buckets,
        )
        self.histogram_backup_pgs = make_histogram(
            "sglang:backup_pgs",
            "Histogram of backup pages of batches.",
            page_buckets,
        )
        self.histogram_prefetch_bandwidth = make_histogram(
            "sglang:prefetch_bandwidth",
            "Histogram of prefetch bandwidth in GB/s.",
            bandwidth_buckets,
        )
        self.histogram_backup_bandwidth = make_histogram(
            "sglang:backup_bandwidth",
            "Histogram of backup bandwidth in GB/s.",
            bandwidth_buckets,
        )

    def log_prefetched_tokens(self, prefetched_tokens: int):
        """Add ``prefetched_tokens`` to the prefetched counter when positive."""
        if not prefetched_tokens > 0:
            return
        self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)

    def log_backuped_tokens(self, backuped_tokens: int):
        """Add ``backuped_tokens`` to the backed-up counter when positive."""
        if not backuped_tokens > 0:
            return
        self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)

    def _log_histogram(self, histogram, data: Union[int, float]):
        """Observe ``data`` on ``histogram`` under this collector's labels."""
        labelled = histogram.labels(**self.labels)
        labelled.observe(data)

    def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
        """Observe every sample in ``storage_metrics``; ``None`` is a no-op.

        Samples are logged in field order: prefetch pages, backup pages,
        prefetch bandwidth, backup bandwidth.
        """
        if storage_metrics is None:
            return

        assert isinstance(storage_metrics, StorageMetrics)

        histogram_and_samples = (
            (self.histogram_prefetch_pgs, storage_metrics.prefetch_pgs),
            (self.histogram_backup_pgs, storage_metrics.backup_pgs),
            (self.histogram_prefetch_bandwidth, storage_metrics.prefetch_bandwidth),
            (self.histogram_backup_bandwidth, storage_metrics.backup_bandwidth),
        )
        for histogram, samples in histogram_and_samples:
            for sample in samples:
                self._log_histogram(histogram, sample)
|