sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/device_config.py +3 -1
  5. sglang/srt/configs/dots_vlm.py +139 -0
  6. sglang/srt/configs/load_config.py +1 -0
  7. sglang/srt/configs/model_config.py +50 -6
  8. sglang/srt/configs/qwen3_next.py +326 -0
  9. sglang/srt/connector/__init__.py +8 -1
  10. sglang/srt/connector/remote_instance.py +82 -0
  11. sglang/srt/constrained/base_grammar_backend.py +48 -12
  12. sglang/srt/constrained/llguidance_backend.py +0 -1
  13. sglang/srt/constrained/outlines_backend.py +0 -1
  14. sglang/srt/constrained/xgrammar_backend.py +28 -9
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/base/conn.py +1 -1
  21. sglang/srt/disaggregation/common/conn.py +15 -12
  22. sglang/srt/disaggregation/decode.py +21 -10
  23. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -445
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +5 -3
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +24 -3
  31. sglang/srt/entrypoints/engine.py +38 -17
  32. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  33. sglang/srt/entrypoints/grpc_server.py +680 -0
  34. sglang/srt/entrypoints/http_server.py +85 -54
  35. sglang/srt/entrypoints/openai/protocol.py +4 -1
  36. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  37. sglang/srt/entrypoints/openai/serving_chat.py +36 -16
  38. sglang/srt/entrypoints/openai/serving_completions.py +12 -3
  39. sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
  40. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  41. sglang/srt/entrypoints/openai/serving_responses.py +6 -3
  42. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  43. sglang/srt/eplb/eplb_manager.py +2 -2
  44. sglang/srt/eplb/expert_distribution.py +26 -13
  45. sglang/srt/eplb/expert_location.py +8 -3
  46. sglang/srt/eplb/expert_location_updater.py +1 -1
  47. sglang/srt/function_call/base_format_detector.py +3 -6
  48. sglang/srt/function_call/ebnf_composer.py +11 -9
  49. sglang/srt/function_call/function_call_parser.py +6 -0
  50. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  51. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  52. sglang/srt/grpc/__init__.py +1 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  56. sglang/srt/hf_transformers_utils.py +4 -0
  57. sglang/srt/layers/activation.py +142 -9
  58. sglang/srt/layers/attention/ascend_backend.py +11 -4
  59. sglang/srt/layers/attention/fla/chunk.py +242 -0
  60. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  61. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  62. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  63. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  66. sglang/srt/layers/attention/fla/index.py +37 -0
  67. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  68. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  69. sglang/srt/layers/attention/fla/op.py +66 -0
  70. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  71. sglang/srt/layers/attention/fla/utils.py +331 -0
  72. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  73. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  74. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  75. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  76. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  77. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  80. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  81. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  82. sglang/srt/layers/attention/triton_backend.py +18 -1
  83. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  84. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  85. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  86. sglang/srt/layers/dp_attention.py +30 -1
  87. sglang/srt/layers/layernorm.py +32 -15
  88. sglang/srt/layers/linear.py +34 -3
  89. sglang/srt/layers/logits_processor.py +29 -10
  90. sglang/srt/layers/moe/__init__.py +2 -1
  91. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  92. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  93. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  94. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  95. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  96. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  107. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  108. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  109. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  110. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  111. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  112. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  113. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  114. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  115. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  116. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  117. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  118. sglang/srt/layers/moe/topk.py +30 -9
  119. sglang/srt/layers/moe/utils.py +12 -6
  120. sglang/srt/layers/quantization/awq.py +19 -7
  121. sglang/srt/layers/quantization/base_config.py +11 -6
  122. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  123. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  124. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  125. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  126. sglang/srt/layers/quantization/fp8.py +76 -47
  127. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  128. sglang/srt/layers/quantization/gptq.py +25 -17
  129. sglang/srt/layers/quantization/modelopt_quant.py +147 -47
  130. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  131. sglang/srt/layers/quantization/mxfp4.py +64 -40
  132. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  133. sglang/srt/layers/quantization/unquant.py +135 -47
  134. sglang/srt/layers/quantization/w4afp8.py +30 -17
  135. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  136. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  137. sglang/srt/layers/sampler.py +162 -18
  138. sglang/srt/lora/backend/base_backend.py +50 -8
  139. sglang/srt/lora/backend/triton_backend.py +90 -2
  140. sglang/srt/lora/layers.py +32 -0
  141. sglang/srt/lora/lora.py +4 -1
  142. sglang/srt/lora/lora_manager.py +35 -112
  143. sglang/srt/lora/mem_pool.py +24 -10
  144. sglang/srt/lora/utils.py +18 -9
  145. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  146. sglang/srt/managers/cache_controller.py +158 -160
  147. sglang/srt/managers/data_parallel_controller.py +105 -35
  148. sglang/srt/managers/detokenizer_manager.py +8 -4
  149. sglang/srt/managers/disagg_service.py +46 -0
  150. sglang/srt/managers/io_struct.py +199 -12
  151. sglang/srt/managers/mm_utils.py +1 -0
  152. sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
  153. sglang/srt/managers/schedule_batch.py +77 -56
  154. sglang/srt/managers/schedule_policy.py +1 -1
  155. sglang/srt/managers/scheduler.py +187 -39
  156. sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
  157. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  158. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  159. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  160. sglang/srt/managers/tokenizer_manager.py +259 -519
  161. sglang/srt/managers/tp_worker.py +53 -4
  162. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  163. sglang/srt/mem_cache/hicache_storage.py +3 -23
  164. sglang/srt/mem_cache/hiradix_cache.py +103 -43
  165. sglang/srt/mem_cache/memory_pool.py +347 -48
  166. sglang/srt/mem_cache/memory_pool_host.py +105 -46
  167. sglang/srt/mem_cache/radix_cache.py +0 -2
  168. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  169. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  170. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
  171. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  172. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  173. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
  174. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  175. sglang/srt/metrics/collector.py +493 -76
  176. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  177. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  178. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  179. sglang/srt/model_executor/forward_batch_info.py +59 -2
  180. sglang/srt/model_executor/model_runner.py +356 -29
  181. sglang/srt/model_loader/__init__.py +9 -3
  182. sglang/srt/model_loader/loader.py +128 -4
  183. sglang/srt/model_loader/weight_utils.py +2 -1
  184. sglang/srt/models/apertus.py +686 -0
  185. sglang/srt/models/bailing_moe.py +798 -218
  186. sglang/srt/models/bailing_moe_nextn.py +168 -0
  187. sglang/srt/models/deepseek_v2.py +109 -15
  188. sglang/srt/models/dots_vlm.py +174 -0
  189. sglang/srt/models/dots_vlm_vit.py +337 -0
  190. sglang/srt/models/ernie4.py +1 -1
  191. sglang/srt/models/gemma3n_mm.py +1 -1
  192. sglang/srt/models/glm4_moe.py +1 -1
  193. sglang/srt/models/glm4v.py +4 -2
  194. sglang/srt/models/glm4v_moe.py +3 -0
  195. sglang/srt/models/gpt_oss.py +1 -1
  196. sglang/srt/models/llama4.py +9 -0
  197. sglang/srt/models/llama_eagle3.py +13 -0
  198. sglang/srt/models/longcat_flash.py +2 -2
  199. sglang/srt/models/mllama4.py +25 -0
  200. sglang/srt/models/opt.py +637 -0
  201. sglang/srt/models/qwen2.py +7 -0
  202. sglang/srt/models/qwen2_5_vl.py +27 -3
  203. sglang/srt/models/qwen2_moe.py +56 -12
  204. sglang/srt/models/qwen3_moe.py +1 -1
  205. sglang/srt/models/qwen3_next.py +1042 -0
  206. sglang/srt/models/qwen3_next_mtp.py +112 -0
  207. sglang/srt/models/step3_vl.py +1 -1
  208. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  209. sglang/srt/multimodal/processors/glm4v.py +9 -9
  210. sglang/srt/multimodal/processors/internvl.py +141 -129
  211. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  212. sglang/srt/offloader.py +27 -3
  213. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  214. sglang/srt/sampling/sampling_batch_info.py +18 -15
  215. sglang/srt/server_args.py +276 -35
  216. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  217. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  218. sglang/srt/speculative/eagle_utils.py +0 -2
  219. sglang/srt/speculative/eagle_worker.py +43 -4
  220. sglang/srt/speculative/spec_info.py +5 -0
  221. sglang/srt/speculative/standalone_worker.py +109 -0
  222. sglang/srt/tracing/trace.py +552 -0
  223. sglang/srt/utils.py +34 -3
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  226. sglang/test/runners.py +4 -0
  227. sglang/test/test_cutlass_moe.py +24 -6
  228. sglang/test/test_disaggregation_utils.py +66 -0
  229. sglang/test/test_fp4_moe.py +370 -1
  230. sglang/test/test_utils.py +28 -1
  231. sglang/utils.py +11 -0
  232. sglang/version.py +1 -1
  233. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  234. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
  235. sglang/srt/disaggregation/launch_lb.py +0 -118
  236. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  237. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  238. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
@@ -12,9 +12,8 @@
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
14
  """Utilities for Prometheus Metrics Collection."""
15
-
16
15
  import time
17
- from dataclasses import dataclass
16
+ from dataclasses import dataclass, field
18
17
  from enum import Enum
19
18
  from typing import Dict, List, Optional, Union
20
19
 
@@ -50,6 +49,9 @@ class TimeStats:
50
49
  DECODE = "decode"
51
50
  INVALID = "invalid"
52
51
 
52
+ def get_queueing_time(self) -> float:
53
+ return self.forward_entry_time - self.wait_queue_entry_time
54
+
53
55
  def __str__(self) -> str:
54
56
  # if unified
55
57
  _type = self.get_type()
@@ -134,27 +136,48 @@ class TimeStats:
134
136
 
135
137
  @dataclass
136
138
  class SchedulerStats:
139
+ # Basics
137
140
  num_running_reqs: int = 0
138
141
  num_used_tokens: int = 0
139
142
  token_usage: float = 0.0
143
+ swa_token_usage: float = 0.0
140
144
  gen_throughput: float = 0.0
141
145
  num_queue_reqs: int = 0
142
- cache_hit_rate: float = 0.0
143
146
  num_grammar_queue_reqs: int = 0
144
- spec_accept_length: float = 0.0
147
+ num_running_reqs_offline_batch: int = 0
145
148
  avg_request_queue_latency: float = 0.0
149
+ cache_hit_rate: float = 0.0
150
+
151
+ # Speculative decoding
152
+ spec_accept_length: float = 0.0
153
+
154
+ # PD disaggregation
146
155
  num_prefill_prealloc_queue_reqs: int = 0
147
156
  num_prefill_inflight_queue_reqs: int = 0
148
157
  num_decode_prealloc_queue_reqs: int = 0
149
158
  num_decode_transfer_queue_reqs: int = 0
159
+ kv_transfer_speed_gb_s: float = 0.0
160
+ kv_transfer_latency_ms: float = 0.0
161
+
162
+ # Retract
150
163
  total_retracted_reqs: int = 0
164
+ num_retracted_reqs: int = 0
165
+ num_paused_reqs: int = 0
166
+
167
+ # Utilization
168
+ utilization: float = 0.0
169
+ max_running_requests_under_SLO: Optional[int] = None
170
+
171
+ # Engine startup
172
+ engine_startup_time: float = 0.0
173
+ engine_load_weights_time: float = 0.0
151
174
 
152
175
 
153
176
  class SchedulerMetricsCollector:
154
177
 
155
178
  def __init__(self, labels: Dict[str, str]) -> None:
156
179
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
157
- from prometheus_client import Counter, Gauge
180
+ from prometheus_client import Counter, Gauge, Histogram
158
181
 
159
182
  self.labels = labels
160
183
  self.last_log_time = time.perf_counter()
@@ -165,115 +188,338 @@ class SchedulerMetricsCollector:
165
188
  labelnames=labels.keys(),
166
189
  multiprocess_mode="mostrecent",
167
190
  )
168
-
169
191
  self.num_used_tokens = Gauge(
170
192
  name="sglang:num_used_tokens",
171
193
  documentation="The number of used tokens.",
172
194
  labelnames=labels.keys(),
173
195
  multiprocess_mode="mostrecent",
174
196
  )
175
-
176
197
  self.token_usage = Gauge(
177
198
  name="sglang:token_usage",
178
199
  documentation="The token usage.",
179
200
  labelnames=labels.keys(),
180
201
  multiprocess_mode="mostrecent",
181
202
  )
182
-
203
+ self.swa_token_usage = Gauge(
204
+ name="sglang:swa_token_usage",
205
+ documentation="The token usage for SWA layers.",
206
+ labelnames=labels.keys(),
207
+ multiprocess_mode="mostrecent",
208
+ )
183
209
  self.gen_throughput = Gauge(
184
210
  name="sglang:gen_throughput",
185
211
  documentation="The generation throughput (token/s).",
186
212
  labelnames=labels.keys(),
187
213
  multiprocess_mode="mostrecent",
188
214
  )
189
-
190
215
  self.num_queue_reqs = Gauge(
191
216
  name="sglang:num_queue_reqs",
192
217
  documentation="The number of requests in the waiting queue.",
193
218
  labelnames=labels.keys(),
194
219
  multiprocess_mode="mostrecent",
195
220
  )
196
-
197
221
  self.num_grammar_queue_reqs = Gauge(
198
222
  name="sglang:num_grammar_queue_reqs",
199
223
  documentation="The number of requests in the grammar waiting queue.",
200
224
  labelnames=labels.keys(),
201
225
  multiprocess_mode="mostrecent",
202
226
  )
203
-
204
- self.cache_hit_rate = Gauge(
205
- name="sglang:cache_hit_rate",
206
- documentation="The prefix cache hit rate.",
207
- labelnames=labels.keys(),
208
- multiprocess_mode="mostrecent",
209
- )
210
-
211
- self.spec_accept_length = Gauge(
212
- name="sglang:spec_accept_length",
213
- documentation="The average acceptance length of speculative decoding.",
227
+ self.num_running_reqs_offline_batch = Gauge(
228
+ name="sglang:num_running_reqs_offline_batch",
229
+ documentation="The number of running low-priority offline batch requests(label is 'batch').",
214
230
  labelnames=labels.keys(),
215
231
  multiprocess_mode="mostrecent",
216
232
  )
217
-
218
233
  self.avg_request_queue_latency = Gauge(
219
234
  name="sglang:avg_request_queue_latency",
220
235
  documentation="The average request queue latency for the last batch of requests in seconds.",
221
236
  labelnames=labels.keys(),
222
237
  multiprocess_mode="mostrecent",
223
238
  )
239
+ self.cache_hit_rate = Gauge(
240
+ name="sglang:cache_hit_rate",
241
+ documentation="The prefix cache hit rate.",
242
+ labelnames=labels.keys(),
243
+ multiprocess_mode="mostrecent",
244
+ )
224
245
 
225
- self.total_retracted_reqs = Gauge(
226
- name="sglang:total_retracted_reqs",
227
- documentation="The total number of retracted requests due to kvcache full.",
246
+ # Speculative decoding
247
+ self.spec_accept_length = Gauge(
248
+ name="sglang:spec_accept_length",
249
+ documentation="The average acceptance length of speculative decoding.",
228
250
  labelnames=labels.keys(),
229
251
  multiprocess_mode="mostrecent",
230
252
  )
231
253
 
232
- # Disaggregation queue metrics
254
+ # PD disaggregation
233
255
  self.num_prefill_prealloc_queue_reqs = Gauge(
234
256
  name="sglang:num_prefill_prealloc_queue_reqs",
235
257
  documentation="The number of requests in the prefill prealloc queue.",
236
258
  labelnames=labels.keys(),
237
259
  multiprocess_mode="mostrecent",
238
260
  )
239
-
240
261
  self.num_prefill_inflight_queue_reqs = Gauge(
241
262
  name="sglang:num_prefill_inflight_queue_reqs",
242
263
  documentation="The number of requests in the prefill inflight queue.",
243
264
  labelnames=labels.keys(),
244
265
  multiprocess_mode="mostrecent",
245
266
  )
246
-
247
267
  self.num_decode_prealloc_queue_reqs = Gauge(
248
268
  name="sglang:num_decode_prealloc_queue_reqs",
249
269
  documentation="The number of requests in the decode prealloc queue.",
250
270
  labelnames=labels.keys(),
251
271
  multiprocess_mode="mostrecent",
252
272
  )
253
-
254
273
  self.num_decode_transfer_queue_reqs = Gauge(
255
274
  name="sglang:num_decode_transfer_queue_reqs",
256
275
  documentation="The number of requests in the decode transfer queue.",
257
276
  labelnames=labels.keys(),
258
277
  multiprocess_mode="mostrecent",
259
278
  )
260
-
261
279
  self.num_bootstrap_failed_reqs = Counter(
262
- name="sglang:num_bootstrap_failed_reqs",
280
+ name="sglang:num_bootstrap_failed_reqs_total",
263
281
  documentation="The number of bootstrap failed requests.",
264
282
  labelnames=labels.keys(),
265
283
  )
266
-
267
284
  self.num_transfer_failed_reqs = Counter(
268
- name="sglang:num_transfer_failed_reqs",
285
+ name="sglang:num_transfer_failed_reqs_total",
269
286
  documentation="The number of transfer failed requests.",
270
287
  labelnames=labels.keys(),
271
288
  )
289
+ self.kv_transfer_speed_gb_s = Gauge(
290
+ name="sglang:kv_transfer_speed_gb_s",
291
+ documentation="The transfer speed of the KV cache in GB/s.",
292
+ labelnames=labels.keys(),
293
+ multiprocess_mode="mostrecent",
294
+ )
295
+ self.kv_transfer_latency_ms = Gauge(
296
+ name="sglang:kv_transfer_latency_ms",
297
+ documentation="The transfer latency of the KV cache in ms.",
298
+ labelnames=labels.keys(),
299
+ multiprocess_mode="mostrecent",
300
+ )
301
+
302
+ # Retract
303
+ self.total_retracted_reqs = Gauge(
304
+ name="sglang:total_retracted_reqs",
305
+ documentation="The total number of retracted requests due to kvcache full.",
306
+ labelnames=labels.keys(),
307
+ multiprocess_mode="mostrecent",
308
+ )
309
+ self.num_retracted_reqs = Gauge(
310
+ name="sglang:num_retracted_reqs",
311
+ documentation="The number of retracted requests.",
312
+ labelnames=labels.keys(),
313
+ )
314
+ self.num_paused_reqs = Gauge(
315
+ name="sglang:num_paused_reqs",
316
+ documentation="The number of paused requests by async weight sync.",
317
+ labelnames=labels.keys(),
318
+ )
319
+
320
+ # Utilization
321
+ self.utilization = Gauge(
322
+ name="sglang:utilization",
323
+ documentation="The utilization.",
324
+ labelnames=labels.keys(),
325
+ multiprocess_mode="mostrecent",
326
+ )
327
+ self.max_running_requests_under_SLO = Gauge(
328
+ name="sglang:max_running_requests_under_SLO",
329
+ documentation="The maximum number of running requests under SLO.",
330
+ labelnames=labels.keys(),
331
+ multiprocess_mode="mostrecent",
332
+ )
333
+
334
+ # Engine startup
335
+ self.engine_startup_time = Gauge(
336
+ name="sglang:engine_startup_time",
337
+ documentation="The time taken for the engine to start up.",
338
+ labelnames=labels.keys(),
339
+ multiprocess_mode="mostrecent",
340
+ )
341
+ self.engine_load_weights_time = Gauge(
342
+ name="sglang:engine_load_weights_time",
343
+ documentation="The time taken for the engine to load weights.",
344
+ labelnames=labels.keys(),
345
+ multiprocess_mode="mostrecent",
346
+ )
347
+
348
+ # Additional queueing time histogram
349
+ self.queue_time = Histogram(
350
+ name="sglang:queue_time_s",
351
+ documentation="Histogram of queueing time in seconds.",
352
+ labelnames=labels.keys(),
353
+ buckets=[
354
+ 0.0,
355
+ 0.1,
356
+ 0.2,
357
+ 0.5,
358
+ 1,
359
+ 2,
360
+ 3,
361
+ 4,
362
+ 5,
363
+ 10,
364
+ 15,
365
+ 20,
366
+ 30,
367
+ 40,
368
+ 50,
369
+ 60,
370
+ 70,
371
+ 80,
372
+ 90,
373
+ 100,
374
+ 200,
375
+ 300,
376
+ 400,
377
+ 500,
378
+ 600,
379
+ 700,
380
+ 800,
381
+ 900,
382
+ 1000,
383
+ 1200,
384
+ 1400,
385
+ 1600,
386
+ 1800,
387
+ 2000,
388
+ 2500,
389
+ 3000,
390
+ ],
391
+ )
392
+
393
+ # Grammar metrics
394
+ self.grammar_compilation_time = Histogram(
395
+ name="sglang:grammar_compilation_time_seconds",
396
+ documentation="Histogram of grammar compilation time in seconds.",
397
+ labelnames=labels.keys(),
398
+ buckets=[
399
+ 0.0,
400
+ 0.01,
401
+ 0.02,
402
+ 0.05,
403
+ 0.1,
404
+ 0.2,
405
+ 0.5,
406
+ 1,
407
+ 2,
408
+ 5,
409
+ 10,
410
+ 20,
411
+ 30,
412
+ 60,
413
+ 90,
414
+ 120,
415
+ 240,
416
+ ],
417
+ )
418
+ self.num_grammar_cache_hit = Counter(
419
+ name="sglang:num_grammar_cache_hit_total",
420
+ documentation="Number of grammar cache hits.",
421
+ labelnames=labels.keys(),
422
+ )
423
+ self.num_grammar_aborted = Counter(
424
+ name="sglang:num_grammar_aborted_total",
425
+ documentation="Number of grammar aborted requests.",
426
+ labelnames=labels.keys(),
427
+ )
428
+ self.num_grammar_total = Counter(
429
+ name="sglang:num_grammar_total",
430
+ documentation="Number of the total grammar requests.",
431
+ labelnames=labels.keys(),
432
+ )
433
+ self.grammar_schema_count = Histogram(
434
+ name="sglang:grammar_schema_count",
435
+ documentation="Histogram of grammar schema count.",
436
+ labelnames=labels.keys(),
437
+ buckets=[
438
+ 0,
439
+ 1,
440
+ 2,
441
+ 5,
442
+ 10,
443
+ 20,
444
+ 30,
445
+ 40,
446
+ 60,
447
+ 80,
448
+ 100,
449
+ 120,
450
+ 140,
451
+ 160,
452
+ 180,
453
+ 200,
454
+ 300,
455
+ 400,
456
+ 500,
457
+ 700,
458
+ 1000,
459
+ ],
460
+ )
461
+ self.grammar_ebnf_size = Histogram(
462
+ name="sglang:grammar_ebnf_size",
463
+ documentation="Histogram of grammar EBNF size.",
464
+ labelnames=labels.keys(),
465
+ buckets=[
466
+ 0,
467
+ 50,
468
+ 100,
469
+ 200,
470
+ 300,
471
+ 500,
472
+ 1000,
473
+ 2000,
474
+ 3000,
475
+ 5000,
476
+ 10000,
477
+ 20000,
478
+ 30000,
479
+ 50000,
480
+ 100000,
481
+ ],
482
+ )
483
+
484
+ tree_traversal_time_buckets = [
485
+ 0.0,
486
+ 0.01,
487
+ 0.02,
488
+ 0.05,
489
+ 0.1,
490
+ 0.2,
491
+ 0.5,
492
+ 1,
493
+ 2,
494
+ 5,
495
+ 10,
496
+ 15,
497
+ 30,
498
+ 60,
499
+ 90,
500
+ 120,
501
+ 240,
502
+ ]
503
+ self.grammar_tree_traversal_time_avg = Histogram(
504
+ name="sglang:grammar_tree_traversal_time_avg",
505
+ documentation="Histogram of average grammar tree traversal time in seconds.",
506
+ labelnames=labels.keys(),
507
+ buckets=tree_traversal_time_buckets,
508
+ )
509
+ self.grammar_tree_traversal_time_max = Histogram(
510
+ name="sglang:grammar_tree_traversal_time_max",
511
+ documentation="Histogram of max grammar tree traversal time in seconds.",
512
+ labelnames=labels.keys(),
513
+ buckets=tree_traversal_time_buckets,
514
+ )
272
515
 
273
516
  def _log_gauge(self, gauge, data: Union[int, float]) -> None:
274
517
  # Convenience function for logging to gauge.
275
518
  gauge.labels(**self.labels).set(data)
276
519
 
520
+ def log_histogram(self, histogram, data: Union[int, float]) -> None:
521
+ histogram.labels(**self.labels).observe(data)
522
+
277
523
  def increment_bootstrap_failed_reqs(self) -> None:
278
524
  self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
279
525
 
@@ -284,14 +530,20 @@ class SchedulerMetricsCollector:
284
530
  self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
285
531
  self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
286
532
  self._log_gauge(self.token_usage, stats.token_usage)
533
+ self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
287
534
  self._log_gauge(self.gen_throughput, stats.gen_throughput)
288
535
  self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
289
536
  self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
537
+ self._log_gauge(
538
+ self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
539
+ )
290
540
  self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
541
+ self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
542
+
543
+ # Speculative decoding
291
544
  self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
292
- self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
293
545
 
294
- # Disaggregation metrics
546
+ # PD disaggregation
295
547
  self._log_gauge(
296
548
  self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
297
549
  )
@@ -304,15 +556,59 @@ class SchedulerMetricsCollector:
304
556
  self._log_gauge(
305
557
  self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
306
558
  )
559
+ self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
560
+ self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
561
+
562
+ # Retract
563
+ self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
564
+ self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
565
+ self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
566
+
567
+ # Utilization
568
+ self._log_gauge(self.utilization, stats.utilization)
569
+ if stats.max_running_requests_under_SLO is not None:
570
+ self._log_gauge(
571
+ self.max_running_requests_under_SLO,
572
+ stats.max_running_requests_under_SLO,
573
+ )
574
+
575
+ # Engine startup time
576
+ self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
577
+ if stats.engine_load_weights_time is not None:
578
+ self._log_gauge(
579
+ self.engine_load_weights_time, stats.engine_load_weights_time
580
+ )
307
581
 
308
582
  self.last_log_time = time.perf_counter()
309
583
 
584
+ def log_grammar_stats(self, grammar_stats) -> None:
585
+ # Duck-typed GrammarStats to avoid cross-package dependency
586
+ if getattr(grammar_stats, "compilation_time", None) is not None:
587
+ self.log_histogram(
588
+ self.grammar_compilation_time, grammar_stats.compilation_time
589
+ )
590
+ if getattr(grammar_stats, "schema_count", None) is not None:
591
+ self.log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
592
+ if getattr(grammar_stats, "ebnf_size", None) is not None:
593
+ self.log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
594
+ tree_times = getattr(grammar_stats, "tree_traversal_time", None)
595
+ if tree_times:
596
+ max_time = max(tree_times)
597
+ avg_time = sum(tree_times) / len(tree_times)
598
+ self.log_histogram(self.grammar_tree_traversal_time_max, max_time)
599
+ self.log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
600
+ if getattr(grammar_stats, "is_cache_hit", False):
601
+ self.num_grammar_cache_hit.labels(**self.labels).inc(1)
602
+ if getattr(grammar_stats, "is_grammar_aborted", False):
603
+ self.num_grammar_aborted.labels(**self.labels).inc(1)
604
+ self.num_grammar_total.labels(**self.labels).inc(1)
605
+
310
606
 
311
607
  class TokenizerMetricsCollector:
312
608
  def __init__(
313
609
  self,
314
- server_args: ServerArgs,
315
- labels: Dict[str, str],
610
+ server_args: Optional[ServerArgs] = None,
611
+ labels: Dict[str, str] = None,
316
612
  bucket_time_to_first_token: Optional[List[float]] = None,
317
613
  bucket_inter_token_latency: Optional[List[float]] = None,
318
614
  bucket_e2e_request_latency: Optional[List[float]] = None,
@@ -321,7 +617,7 @@ class TokenizerMetricsCollector:
321
617
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
322
618
  from prometheus_client import Counter, Histogram
323
619
 
324
- self.labels = labels
620
+ self.labels = labels or {}
325
621
  self.collect_tokens_histogram = collect_tokens_histogram
326
622
 
327
623
  self.prompt_tokens_total = Counter(
@@ -361,6 +657,13 @@ class TokenizerMetricsCollector:
361
657
  30000,
362
658
  35000,
363
659
  40000,
660
+ 66000,
661
+ 99000,
662
+ 132000,
663
+ 300000,
664
+ 600000,
665
+ 900000,
666
+ 1100000,
364
667
  ]
365
668
  self.prompt_tokens_histogram = Histogram(
366
669
  name="sglang:prompt_tokens_histogram",
@@ -370,34 +673,13 @@ class TokenizerMetricsCollector:
370
673
  server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
371
674
  ),
372
675
  )
373
- default_bucket_generation_tokens = [
374
- 100,
375
- 300,
376
- 500,
377
- 1000,
378
- 1200,
379
- 1500,
380
- 1700,
381
- 2000,
382
- 2500,
383
- 3000,
384
- 3500,
385
- 4000,
386
- 4500,
387
- 5000,
388
- 6000,
389
- 7000,
390
- 8000,
391
- 9000,
392
- 10000,
393
- ]
394
676
  self.generation_tokens_histogram = Histogram(
395
677
  name="sglang:generation_tokens_histogram",
396
678
  documentation="Histogram of generation token length.",
397
679
  labelnames=labels.keys(),
398
680
  buckets=generate_buckets(
399
681
  server_args.generation_tokens_buckets,
400
- default_bucket_generation_tokens,
682
+ default_bucket_prompt_tokens,
401
683
  ),
402
684
  )
403
685
 
@@ -467,7 +749,10 @@ class TokenizerMetricsCollector:
467
749
  100,
468
750
  200,
469
751
  400,
470
- 800,
752
+ 600,
753
+ 1200,
754
+ 1800,
755
+ 2400,
471
756
  ]
472
757
 
473
758
  if bucket_inter_token_latency is None:
@@ -518,38 +803,68 @@ class TokenizerMetricsCollector:
518
803
  buckets=bucket_e2e_request_latency,
519
804
  )
520
805
 
521
- def _log_histogram(self, histogram, data: Union[int, float]) -> None:
522
- histogram.labels(**self.labels).observe(data)
806
+ # Offline batch specific TTFB histogram
807
+ self.histogram_time_to_first_token_offline_batch = Histogram(
808
+ name="sglang:time_to_first_token_seconds_offline_batch",
809
+ documentation="Histogram of time to first token in seconds for offline batch requests.",
810
+ labelnames=labels.keys(),
811
+ buckets=bucket_time_to_first_token,
812
+ )
523
813
 
524
814
  def observe_one_finished_request(
525
815
  self,
816
+ labels: Dict[str, str],
526
817
  prompt_tokens: int,
527
818
  generation_tokens: int,
528
819
  cached_tokens: int,
529
820
  e2e_latency: float,
530
821
  has_grammar: bool,
531
822
  ):
532
- self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
533
- self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
823
+ self.prompt_tokens_total.labels(**labels).inc(prompt_tokens)
824
+ self.generation_tokens_total.labels(**labels).inc(generation_tokens)
534
825
  if cached_tokens > 0:
535
- self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
536
- self.num_requests_total.labels(**self.labels).inc(1)
826
+ self.cached_tokens_total.labels(**labels).inc(cached_tokens)
827
+ self.num_requests_total.labels(**labels).inc(1)
537
828
  if has_grammar:
538
- self.num_so_requests_total.labels(**self.labels).inc(1)
539
- self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
829
+ self.num_so_requests_total.labels(**labels).inc(1)
830
+ self.histogram_e2e_request_latency.labels(**labels).observe(float(e2e_latency))
540
831
  if self.collect_tokens_histogram:
541
- self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
542
- self._log_histogram(self.generation_tokens_histogram, generation_tokens)
543
-
544
- def observe_time_to_first_token(self, value: float):
545
- self.histogram_time_to_first_token.labels(**self.labels).observe(value)
832
+ self.prompt_tokens_histogram.labels(**labels).observe(float(prompt_tokens))
833
+ self.generation_tokens_histogram.labels(**labels).observe(
834
+ float(generation_tokens)
835
+ )
546
836
 
547
- def observe_inter_token_latency(self, internval: float, num_new_tokens: int):
837
+ def observe_time_to_first_token(
838
+ self, labels: Dict[str, str], value: float, type: str = ""
839
+ ):
840
+ if type == "batch":
841
+ self.histogram_time_to_first_token_offline_batch.labels(**labels).observe(
842
+ value
843
+ )
844
+ else:
845
+ self.histogram_time_to_first_token.labels(**labels).observe(value)
846
+
847
+ def check_time_to_first_token_straggler(self, value: float) -> bool:
848
+ his = self.histogram_time_to_first_token.labels(**self.labels)
849
+ total_observations = sum(bucket._value for bucket in his._buckets)
850
+ if total_observations < 100:
851
+ return False
852
+ p99_threshold = total_observations * 0.99
853
+ cumulative_count = 0
854
+ for i, bucket in enumerate(his._buckets):
855
+ cumulative_count += bucket._value
856
+ if cumulative_count > p99_threshold:
857
+ return value >= his._upper_bounds[i]
858
+ return False
859
+
860
+ def observe_inter_token_latency(
861
+ self, labels: Dict[str, str], internval: float, num_new_tokens: int
862
+ ):
548
863
  adjusted_interval = internval / num_new_tokens
549
864
 
550
865
  # A faster version of the Histogram::observe which observes multiple values at the same time.
551
866
  # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
552
- his = self.histogram_inter_token_latency_seconds.labels(**self.labels)
867
+ his = self.histogram_inter_token_latency_seconds.labels(**labels)
553
868
  his._sum.inc(internval)
554
869
 
555
870
  for i, bound in enumerate(his._upper_bounds):
@@ -559,3 +874,105 @@ class TokenizerMetricsCollector:
559
874
 
560
875
  def observe_one_aborted_request(self):
561
876
  self.num_aborted_requests_total.labels(**self.labels).inc(1)
877
+
878
+
879
+ @dataclass
880
+ class StorageMetrics:
881
+ prefetch_pgs: List[int] = field(default_factory=list)
882
+ backup_pgs: List[int] = field(default_factory=list)
883
+ prefetch_bandwidth: List[float] = field(default_factory=list)
884
+ backup_bandwidth: List[float] = field(default_factory=list)
885
+
886
+
887
+ class StorageMetricsCollector:
888
+ def __init__(
889
+ self,
890
+ labels: Dict[str, str],
891
+ ):
892
+ from prometheus_client import Counter, Histogram
893
+
894
+ self.labels = labels
895
+
896
+ self.prefetched_tokens_total = Counter(
897
+ name="sglang:prefetched_tokens_total",
898
+ documentation="Number of prefetched prompt tokens.",
899
+ labelnames=labels.keys(),
900
+ )
901
+
902
+ self.backuped_tokens_total = Counter(
903
+ name="sglang:backuped_tokens_total",
904
+ documentation="Number of backuped tokens.",
905
+ labelnames=labels.keys(),
906
+ )
907
+
908
+ bucket_io = [
909
+ 1,
910
+ 5,
911
+ 10,
912
+ 50,
913
+ 100,
914
+ ]
915
+
916
+ bucket_bandwidth = [
917
+ 0.1,
918
+ 0.5,
919
+ 1,
920
+ 5,
921
+ 10,
922
+ 50,
923
+ 100,
924
+ ]
925
+
926
+ self.histogram_prefetch_pgs = Histogram(
927
+ name="sglang:prefetch_pgs",
928
+ documentation="Histogram of prefetch pages of batches.",
929
+ labelnames=labels.keys(),
930
+ buckets=bucket_io,
931
+ )
932
+
933
+ self.histogram_backup_pgs = Histogram(
934
+ name="sglang:backup_pgs",
935
+ documentation="Histogram of backup pages of batches.",
936
+ labelnames=labels.keys(),
937
+ buckets=bucket_io,
938
+ )
939
+
940
+ self.histogram_prefetch_bandwidth = Histogram(
941
+ name="sglang:prefetch_bandwidth",
942
+ documentation="Histogram of prefetch bandwidth in GB/s.",
943
+ labelnames=labels.keys(),
944
+ buckets=bucket_bandwidth,
945
+ )
946
+
947
+ self.histogram_backup_bandwidth = Histogram(
948
+ name="sglang:backup_bandwidth",
949
+ documentation="Histogram of backup bandwidth in GB/s.",
950
+ labelnames=labels.keys(),
951
+ buckets=bucket_bandwidth,
952
+ )
953
+
954
+ def log_prefetched_tokens(self, prefetched_tokens: int):
955
+ if prefetched_tokens > 0:
956
+ self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)
957
+
958
+ def log_backuped_tokens(self, backuped_tokens: int):
959
+ if backuped_tokens > 0:
960
+ self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)
961
+
962
+ def _log_histogram(self, histogram, data: Union[int, float]):
963
+ histogram.labels(**self.labels).observe(data)
964
+
965
+ def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
966
+ if storage_metrics is None:
967
+ return
968
+
969
+ assert isinstance(storage_metrics, StorageMetrics)
970
+
971
+ for v in storage_metrics.prefetch_pgs:
972
+ self._log_histogram(self.histogram_prefetch_pgs, v)
973
+ for v in storage_metrics.backup_pgs:
974
+ self._log_histogram(self.histogram_backup_pgs, v)
975
+ for v in storage_metrics.prefetch_bandwidth:
976
+ self._log_histogram(self.histogram_prefetch_bandwidth, v)
977
+ for v in storage_metrics.backup_bandwidth:
978
+ self._log_histogram(self.histogram_backup_bandwidth, v)