sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/lang/interpreter.py +1 -1
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/device_config.py +3 -1
  6. sglang/srt/configs/dots_vlm.py +139 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/load_config.py +1 -0
  9. sglang/srt/configs/model_config.py +50 -6
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +8 -1
  12. sglang/srt/connector/remote_instance.py +82 -0
  13. sglang/srt/constrained/base_grammar_backend.py +48 -12
  14. sglang/srt/constrained/llguidance_backend.py +0 -1
  15. sglang/srt/constrained/outlines_backend.py +0 -1
  16. sglang/srt/constrained/xgrammar_backend.py +28 -9
  17. sglang/srt/custom_op.py +11 -1
  18. sglang/srt/debug_utils/dump_comparator.py +81 -44
  19. sglang/srt/debug_utils/dump_loader.py +97 -0
  20. sglang/srt/debug_utils/dumper.py +11 -3
  21. sglang/srt/debug_utils/text_comparator.py +73 -11
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +21 -10
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  26. sglang/srt/disaggregation/fake/conn.py +1 -1
  27. sglang/srt/disaggregation/mini_lb.py +6 -445
  28. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  29. sglang/srt/disaggregation/nixl/conn.py +180 -16
  30. sglang/srt/disaggregation/prefill.py +5 -3
  31. sglang/srt/disaggregation/utils.py +5 -50
  32. sglang/srt/distributed/parallel_state.py +67 -43
  33. sglang/srt/entrypoints/engine.py +38 -17
  34. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  35. sglang/srt/entrypoints/grpc_server.py +680 -0
  36. sglang/srt/entrypoints/http_server.py +88 -53
  37. sglang/srt/entrypoints/openai/protocol.py +7 -4
  38. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  39. sglang/srt/entrypoints/openai/serving_chat.py +39 -19
  40. sglang/srt/entrypoints/openai/serving_completions.py +15 -4
  41. sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
  42. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  43. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  44. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  45. sglang/srt/eplb/eplb_manager.py +2 -2
  46. sglang/srt/eplb/expert_distribution.py +26 -13
  47. sglang/srt/eplb/expert_location.py +8 -3
  48. sglang/srt/eplb/expert_location_updater.py +1 -1
  49. sglang/srt/function_call/base_format_detector.py +3 -6
  50. sglang/srt/function_call/ebnf_composer.py +11 -9
  51. sglang/srt/function_call/function_call_parser.py +6 -0
  52. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  53. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  54. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  55. sglang/srt/grpc/__init__.py +1 -0
  56. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  57. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  58. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  59. sglang/srt/hf_transformers_utils.py +4 -0
  60. sglang/srt/layers/activation.py +142 -9
  61. sglang/srt/layers/attention/aiter_backend.py +93 -68
  62. sglang/srt/layers/attention/ascend_backend.py +11 -4
  63. sglang/srt/layers/attention/fla/chunk.py +242 -0
  64. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  65. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  66. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  67. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  68. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  69. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  70. sglang/srt/layers/attention/fla/index.py +37 -0
  71. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  72. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  73. sglang/srt/layers/attention/fla/op.py +66 -0
  74. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  75. sglang/srt/layers/attention/fla/utils.py +331 -0
  76. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  77. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  78. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  79. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  80. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  81. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  82. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  83. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  84. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  85. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  86. sglang/srt/layers/attention/triton_backend.py +18 -1
  87. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  88. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  89. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  90. sglang/srt/layers/communicator.py +45 -7
  91. sglang/srt/layers/dp_attention.py +30 -1
  92. sglang/srt/layers/layernorm.py +32 -15
  93. sglang/srt/layers/linear.py +34 -3
  94. sglang/srt/layers/logits_processor.py +29 -10
  95. sglang/srt/layers/moe/__init__.py +2 -1
  96. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  97. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  98. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  99. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  100. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  107. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  108. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  113. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  114. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  115. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  116. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  117. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  118. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  119. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  120. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  121. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  122. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  123. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  124. sglang/srt/layers/moe/topk.py +30 -9
  125. sglang/srt/layers/moe/utils.py +12 -7
  126. sglang/srt/layers/quantization/awq.py +19 -7
  127. sglang/srt/layers/quantization/base_config.py +11 -6
  128. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  129. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  130. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  131. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  132. sglang/srt/layers/quantization/fp8.py +76 -47
  133. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  134. sglang/srt/layers/quantization/gptq.py +25 -17
  135. sglang/srt/layers/quantization/modelopt_quant.py +182 -49
  136. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  137. sglang/srt/layers/quantization/mxfp4.py +68 -41
  138. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  139. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  140. sglang/srt/layers/quantization/quark/utils.py +97 -0
  141. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  142. sglang/srt/layers/quantization/unquant.py +135 -47
  143. sglang/srt/layers/quantization/w4afp8.py +30 -17
  144. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  145. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  146. sglang/srt/layers/rocm_linear_utils.py +44 -0
  147. sglang/srt/layers/rotary_embedding.py +0 -18
  148. sglang/srt/layers/sampler.py +162 -18
  149. sglang/srt/lora/backend/base_backend.py +50 -8
  150. sglang/srt/lora/backend/triton_backend.py +90 -2
  151. sglang/srt/lora/layers.py +32 -0
  152. sglang/srt/lora/lora.py +4 -1
  153. sglang/srt/lora/lora_manager.py +35 -112
  154. sglang/srt/lora/mem_pool.py +24 -10
  155. sglang/srt/lora/utils.py +18 -9
  156. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  157. sglang/srt/managers/cache_controller.py +200 -199
  158. sglang/srt/managers/data_parallel_controller.py +105 -35
  159. sglang/srt/managers/detokenizer_manager.py +8 -4
  160. sglang/srt/managers/disagg_service.py +46 -0
  161. sglang/srt/managers/io_struct.py +199 -12
  162. sglang/srt/managers/mm_utils.py +1 -0
  163. sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
  164. sglang/srt/managers/schedule_batch.py +77 -56
  165. sglang/srt/managers/schedule_policy.py +4 -3
  166. sglang/srt/managers/scheduler.py +191 -139
  167. sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
  168. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  169. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  170. sglang/srt/managers/template_manager.py +3 -3
  171. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  172. sglang/srt/managers/tokenizer_manager.py +260 -519
  173. sglang/srt/managers/tp_worker.py +53 -4
  174. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  175. sglang/srt/mem_cache/allocator.py +1 -1
  176. sglang/srt/mem_cache/hicache_storage.py +18 -33
  177. sglang/srt/mem_cache/hiradix_cache.py +108 -48
  178. sglang/srt/mem_cache/memory_pool.py +347 -48
  179. sglang/srt/mem_cache/memory_pool_host.py +121 -57
  180. sglang/srt/mem_cache/radix_cache.py +0 -2
  181. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  182. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  183. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
  184. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  185. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  186. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
  187. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  188. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  189. sglang/srt/metrics/collector.py +502 -77
  190. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  191. sglang/srt/metrics/utils.py +48 -0
  192. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  193. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  194. sglang/srt/model_executor/forward_batch_info.py +75 -19
  195. sglang/srt/model_executor/model_runner.py +357 -30
  196. sglang/srt/model_loader/__init__.py +9 -3
  197. sglang/srt/model_loader/loader.py +128 -4
  198. sglang/srt/model_loader/weight_utils.py +2 -1
  199. sglang/srt/models/apertus.py +686 -0
  200. sglang/srt/models/bailing_moe.py +798 -218
  201. sglang/srt/models/bailing_moe_nextn.py +168 -0
  202. sglang/srt/models/deepseek_v2.py +346 -48
  203. sglang/srt/models/dots_vlm.py +174 -0
  204. sglang/srt/models/dots_vlm_vit.py +337 -0
  205. sglang/srt/models/ernie4.py +1 -1
  206. sglang/srt/models/gemma3n_mm.py +1 -1
  207. sglang/srt/models/glm4_moe.py +11 -2
  208. sglang/srt/models/glm4v.py +4 -2
  209. sglang/srt/models/glm4v_moe.py +3 -0
  210. sglang/srt/models/gpt_oss.py +1 -1
  211. sglang/srt/models/internvl.py +28 -0
  212. sglang/srt/models/llama4.py +9 -0
  213. sglang/srt/models/llama_eagle3.py +13 -0
  214. sglang/srt/models/longcat_flash.py +2 -2
  215. sglang/srt/models/minicpmv.py +165 -3
  216. sglang/srt/models/mllama4.py +25 -0
  217. sglang/srt/models/opt.py +637 -0
  218. sglang/srt/models/qwen2.py +7 -0
  219. sglang/srt/models/qwen2_5_vl.py +27 -3
  220. sglang/srt/models/qwen2_moe.py +60 -13
  221. sglang/srt/models/qwen3.py +8 -2
  222. sglang/srt/models/qwen3_moe.py +40 -9
  223. sglang/srt/models/qwen3_next.py +1042 -0
  224. sglang/srt/models/qwen3_next_mtp.py +112 -0
  225. sglang/srt/models/step3_vl.py +1 -1
  226. sglang/srt/models/torch_native_llama.py +1 -1
  227. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  228. sglang/srt/multimodal/processors/glm4v.py +9 -9
  229. sglang/srt/multimodal/processors/internvl.py +141 -129
  230. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  231. sglang/srt/offloader.py +27 -3
  232. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  233. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  234. sglang/srt/sampling/sampling_batch_info.py +18 -15
  235. sglang/srt/server_args.py +355 -37
  236. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  237. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  238. sglang/srt/speculative/eagle_utils.py +0 -2
  239. sglang/srt/speculative/eagle_worker.py +197 -112
  240. sglang/srt/speculative/spec_info.py +5 -0
  241. sglang/srt/speculative/standalone_worker.py +109 -0
  242. sglang/srt/tracing/trace.py +552 -0
  243. sglang/srt/utils.py +46 -3
  244. sglang/srt/weight_sync/utils.py +1 -1
  245. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  246. sglang/test/few_shot_gsm8k.py +1 -0
  247. sglang/test/runners.py +4 -0
  248. sglang/test/test_cutlass_moe.py +24 -6
  249. sglang/test/test_disaggregation_utils.py +66 -0
  250. sglang/test/test_fp4_moe.py +370 -1
  251. sglang/test/test_utils.py +28 -1
  252. sglang/utils.py +12 -0
  253. sglang/version.py +1 -1
  254. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  255. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
  256. sglang/srt/disaggregation/launch_lb.py +0 -118
  257. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  258. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  259. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  260. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  261. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  262. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  263. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  264. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  265. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
@@ -12,12 +12,13 @@
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
14
  """Utilities for Prometheus Metrics Collection."""
15
-
16
15
  import time
17
- from dataclasses import dataclass
16
+ from dataclasses import dataclass, field
18
17
  from enum import Enum
19
18
  from typing import Dict, List, Optional, Union
20
19
 
20
+ from sglang.srt.metrics.utils import generate_buckets
21
+ from sglang.srt.server_args import ServerArgs
21
22
  from sglang.srt.utils import get_bool_env_var
22
23
 
23
24
  SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
@@ -48,6 +49,9 @@ class TimeStats:
48
49
  DECODE = "decode"
49
50
  INVALID = "invalid"
50
51
 
52
+ def get_queueing_time(self) -> float:
53
+ return self.forward_entry_time - self.wait_queue_entry_time
54
+
51
55
  def __str__(self) -> str:
52
56
  # if unified
53
57
  _type = self.get_type()
@@ -132,27 +136,48 @@ class TimeStats:
132
136
 
133
137
  @dataclass
134
138
  class SchedulerStats:
139
+ # Basics
135
140
  num_running_reqs: int = 0
136
141
  num_used_tokens: int = 0
137
142
  token_usage: float = 0.0
143
+ swa_token_usage: float = 0.0
138
144
  gen_throughput: float = 0.0
139
145
  num_queue_reqs: int = 0
140
- cache_hit_rate: float = 0.0
141
146
  num_grammar_queue_reqs: int = 0
142
- spec_accept_length: float = 0.0
147
+ num_running_reqs_offline_batch: int = 0
143
148
  avg_request_queue_latency: float = 0.0
149
+ cache_hit_rate: float = 0.0
150
+
151
+ # Speculative decoding
152
+ spec_accept_length: float = 0.0
153
+
154
+ # PD disaggregation
144
155
  num_prefill_prealloc_queue_reqs: int = 0
145
156
  num_prefill_inflight_queue_reqs: int = 0
146
157
  num_decode_prealloc_queue_reqs: int = 0
147
158
  num_decode_transfer_queue_reqs: int = 0
159
+ kv_transfer_speed_gb_s: float = 0.0
160
+ kv_transfer_latency_ms: float = 0.0
161
+
162
+ # Retract
148
163
  total_retracted_reqs: int = 0
164
+ num_retracted_reqs: int = 0
165
+ num_paused_reqs: int = 0
166
+
167
+ # Utilization
168
+ utilization: float = 0.0
169
+ max_running_requests_under_SLO: Optional[int] = None
170
+
171
+ # Engine startup
172
+ engine_startup_time: float = 0.0
173
+ engine_load_weights_time: float = 0.0
149
174
 
150
175
 
151
176
  class SchedulerMetricsCollector:
152
177
 
153
178
  def __init__(self, labels: Dict[str, str]) -> None:
154
179
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
155
- from prometheus_client import Counter, Gauge
180
+ from prometheus_client import Counter, Gauge, Histogram
156
181
 
157
182
  self.labels = labels
158
183
  self.last_log_time = time.perf_counter()
@@ -163,115 +188,338 @@ class SchedulerMetricsCollector:
163
188
  labelnames=labels.keys(),
164
189
  multiprocess_mode="mostrecent",
165
190
  )
166
-
167
191
  self.num_used_tokens = Gauge(
168
192
  name="sglang:num_used_tokens",
169
193
  documentation="The number of used tokens.",
170
194
  labelnames=labels.keys(),
171
195
  multiprocess_mode="mostrecent",
172
196
  )
173
-
174
197
  self.token_usage = Gauge(
175
198
  name="sglang:token_usage",
176
199
  documentation="The token usage.",
177
200
  labelnames=labels.keys(),
178
201
  multiprocess_mode="mostrecent",
179
202
  )
180
-
203
+ self.swa_token_usage = Gauge(
204
+ name="sglang:swa_token_usage",
205
+ documentation="The token usage for SWA layers.",
206
+ labelnames=labels.keys(),
207
+ multiprocess_mode="mostrecent",
208
+ )
181
209
  self.gen_throughput = Gauge(
182
210
  name="sglang:gen_throughput",
183
211
  documentation="The generation throughput (token/s).",
184
212
  labelnames=labels.keys(),
185
213
  multiprocess_mode="mostrecent",
186
214
  )
187
-
188
215
  self.num_queue_reqs = Gauge(
189
216
  name="sglang:num_queue_reqs",
190
217
  documentation="The number of requests in the waiting queue.",
191
218
  labelnames=labels.keys(),
192
219
  multiprocess_mode="mostrecent",
193
220
  )
194
-
195
221
  self.num_grammar_queue_reqs = Gauge(
196
222
  name="sglang:num_grammar_queue_reqs",
197
223
  documentation="The number of requests in the grammar waiting queue.",
198
224
  labelnames=labels.keys(),
199
225
  multiprocess_mode="mostrecent",
200
226
  )
201
-
202
- self.cache_hit_rate = Gauge(
203
- name="sglang:cache_hit_rate",
204
- documentation="The prefix cache hit rate.",
205
- labelnames=labels.keys(),
206
- multiprocess_mode="mostrecent",
207
- )
208
-
209
- self.spec_accept_length = Gauge(
210
- name="sglang:spec_accept_length",
211
- documentation="The average acceptance length of speculative decoding.",
227
+ self.num_running_reqs_offline_batch = Gauge(
228
+ name="sglang:num_running_reqs_offline_batch",
229
+ documentation="The number of running low-priority offline batch requests(label is 'batch').",
212
230
  labelnames=labels.keys(),
213
231
  multiprocess_mode="mostrecent",
214
232
  )
215
-
216
233
  self.avg_request_queue_latency = Gauge(
217
234
  name="sglang:avg_request_queue_latency",
218
235
  documentation="The average request queue latency for the last batch of requests in seconds.",
219
236
  labelnames=labels.keys(),
220
237
  multiprocess_mode="mostrecent",
221
238
  )
239
+ self.cache_hit_rate = Gauge(
240
+ name="sglang:cache_hit_rate",
241
+ documentation="The prefix cache hit rate.",
242
+ labelnames=labels.keys(),
243
+ multiprocess_mode="mostrecent",
244
+ )
222
245
 
223
- self.total_retracted_reqs = Gauge(
224
- name="sglang:total_retracted_reqs",
225
- documentation="The total number of retracted requests due to kvcache full.",
246
+ # Speculative decoding
247
+ self.spec_accept_length = Gauge(
248
+ name="sglang:spec_accept_length",
249
+ documentation="The average acceptance length of speculative decoding.",
226
250
  labelnames=labels.keys(),
227
251
  multiprocess_mode="mostrecent",
228
252
  )
229
253
 
230
- # Disaggregation queue metrics
254
+ # PD disaggregation
231
255
  self.num_prefill_prealloc_queue_reqs = Gauge(
232
256
  name="sglang:num_prefill_prealloc_queue_reqs",
233
257
  documentation="The number of requests in the prefill prealloc queue.",
234
258
  labelnames=labels.keys(),
235
259
  multiprocess_mode="mostrecent",
236
260
  )
237
-
238
261
  self.num_prefill_inflight_queue_reqs = Gauge(
239
262
  name="sglang:num_prefill_inflight_queue_reqs",
240
263
  documentation="The number of requests in the prefill inflight queue.",
241
264
  labelnames=labels.keys(),
242
265
  multiprocess_mode="mostrecent",
243
266
  )
244
-
245
267
  self.num_decode_prealloc_queue_reqs = Gauge(
246
268
  name="sglang:num_decode_prealloc_queue_reqs",
247
269
  documentation="The number of requests in the decode prealloc queue.",
248
270
  labelnames=labels.keys(),
249
271
  multiprocess_mode="mostrecent",
250
272
  )
251
-
252
273
  self.num_decode_transfer_queue_reqs = Gauge(
253
274
  name="sglang:num_decode_transfer_queue_reqs",
254
275
  documentation="The number of requests in the decode transfer queue.",
255
276
  labelnames=labels.keys(),
256
277
  multiprocess_mode="mostrecent",
257
278
  )
258
-
259
279
  self.num_bootstrap_failed_reqs = Counter(
260
- name="sglang:num_bootstrap_failed_reqs",
280
+ name="sglang:num_bootstrap_failed_reqs_total",
261
281
  documentation="The number of bootstrap failed requests.",
262
282
  labelnames=labels.keys(),
263
283
  )
264
-
265
284
  self.num_transfer_failed_reqs = Counter(
266
- name="sglang:num_transfer_failed_reqs",
285
+ name="sglang:num_transfer_failed_reqs_total",
267
286
  documentation="The number of transfer failed requests.",
268
287
  labelnames=labels.keys(),
269
288
  )
289
+ self.kv_transfer_speed_gb_s = Gauge(
290
+ name="sglang:kv_transfer_speed_gb_s",
291
+ documentation="The transfer speed of the KV cache in GB/s.",
292
+ labelnames=labels.keys(),
293
+ multiprocess_mode="mostrecent",
294
+ )
295
+ self.kv_transfer_latency_ms = Gauge(
296
+ name="sglang:kv_transfer_latency_ms",
297
+ documentation="The transfer latency of the KV cache in ms.",
298
+ labelnames=labels.keys(),
299
+ multiprocess_mode="mostrecent",
300
+ )
301
+
302
+ # Retract
303
+ self.total_retracted_reqs = Gauge(
304
+ name="sglang:total_retracted_reqs",
305
+ documentation="The total number of retracted requests due to kvcache full.",
306
+ labelnames=labels.keys(),
307
+ multiprocess_mode="mostrecent",
308
+ )
309
+ self.num_retracted_reqs = Gauge(
310
+ name="sglang:num_retracted_reqs",
311
+ documentation="The number of retracted requests.",
312
+ labelnames=labels.keys(),
313
+ )
314
+ self.num_paused_reqs = Gauge(
315
+ name="sglang:num_paused_reqs",
316
+ documentation="The number of paused requests by async weight sync.",
317
+ labelnames=labels.keys(),
318
+ )
319
+
320
+ # Utilization
321
+ self.utilization = Gauge(
322
+ name="sglang:utilization",
323
+ documentation="The utilization.",
324
+ labelnames=labels.keys(),
325
+ multiprocess_mode="mostrecent",
326
+ )
327
+ self.max_running_requests_under_SLO = Gauge(
328
+ name="sglang:max_running_requests_under_SLO",
329
+ documentation="The maximum number of running requests under SLO.",
330
+ labelnames=labels.keys(),
331
+ multiprocess_mode="mostrecent",
332
+ )
333
+
334
+ # Engine startup
335
+ self.engine_startup_time = Gauge(
336
+ name="sglang:engine_startup_time",
337
+ documentation="The time taken for the engine to start up.",
338
+ labelnames=labels.keys(),
339
+ multiprocess_mode="mostrecent",
340
+ )
341
+ self.engine_load_weights_time = Gauge(
342
+ name="sglang:engine_load_weights_time",
343
+ documentation="The time taken for the engine to load weights.",
344
+ labelnames=labels.keys(),
345
+ multiprocess_mode="mostrecent",
346
+ )
347
+
348
+ # Additional queueing time histogram
349
+ self.queue_time = Histogram(
350
+ name="sglang:queue_time_s",
351
+ documentation="Histogram of queueing time in seconds.",
352
+ labelnames=labels.keys(),
353
+ buckets=[
354
+ 0.0,
355
+ 0.1,
356
+ 0.2,
357
+ 0.5,
358
+ 1,
359
+ 2,
360
+ 3,
361
+ 4,
362
+ 5,
363
+ 10,
364
+ 15,
365
+ 20,
366
+ 30,
367
+ 40,
368
+ 50,
369
+ 60,
370
+ 70,
371
+ 80,
372
+ 90,
373
+ 100,
374
+ 200,
375
+ 300,
376
+ 400,
377
+ 500,
378
+ 600,
379
+ 700,
380
+ 800,
381
+ 900,
382
+ 1000,
383
+ 1200,
384
+ 1400,
385
+ 1600,
386
+ 1800,
387
+ 2000,
388
+ 2500,
389
+ 3000,
390
+ ],
391
+ )
392
+
393
+ # Grammar metrics
394
+ self.grammar_compilation_time = Histogram(
395
+ name="sglang:grammar_compilation_time_seconds",
396
+ documentation="Histogram of grammar compilation time in seconds.",
397
+ labelnames=labels.keys(),
398
+ buckets=[
399
+ 0.0,
400
+ 0.01,
401
+ 0.02,
402
+ 0.05,
403
+ 0.1,
404
+ 0.2,
405
+ 0.5,
406
+ 1,
407
+ 2,
408
+ 5,
409
+ 10,
410
+ 20,
411
+ 30,
412
+ 60,
413
+ 90,
414
+ 120,
415
+ 240,
416
+ ],
417
+ )
418
+ self.num_grammar_cache_hit = Counter(
419
+ name="sglang:num_grammar_cache_hit_total",
420
+ documentation="Number of grammar cache hits.",
421
+ labelnames=labels.keys(),
422
+ )
423
+ self.num_grammar_aborted = Counter(
424
+ name="sglang:num_grammar_aborted_total",
425
+ documentation="Number of grammar aborted requests.",
426
+ labelnames=labels.keys(),
427
+ )
428
+ self.num_grammar_total = Counter(
429
+ name="sglang:num_grammar_total",
430
+ documentation="Number of the total grammar requests.",
431
+ labelnames=labels.keys(),
432
+ )
433
+ self.grammar_schema_count = Histogram(
434
+ name="sglang:grammar_schema_count",
435
+ documentation="Histogram of grammar schema count.",
436
+ labelnames=labels.keys(),
437
+ buckets=[
438
+ 0,
439
+ 1,
440
+ 2,
441
+ 5,
442
+ 10,
443
+ 20,
444
+ 30,
445
+ 40,
446
+ 60,
447
+ 80,
448
+ 100,
449
+ 120,
450
+ 140,
451
+ 160,
452
+ 180,
453
+ 200,
454
+ 300,
455
+ 400,
456
+ 500,
457
+ 700,
458
+ 1000,
459
+ ],
460
+ )
461
+ self.grammar_ebnf_size = Histogram(
462
+ name="sglang:grammar_ebnf_size",
463
+ documentation="Histogram of grammar EBNF size.",
464
+ labelnames=labels.keys(),
465
+ buckets=[
466
+ 0,
467
+ 50,
468
+ 100,
469
+ 200,
470
+ 300,
471
+ 500,
472
+ 1000,
473
+ 2000,
474
+ 3000,
475
+ 5000,
476
+ 10000,
477
+ 20000,
478
+ 30000,
479
+ 50000,
480
+ 100000,
481
+ ],
482
+ )
483
+
484
+ tree_traversal_time_buckets = [
485
+ 0.0,
486
+ 0.01,
487
+ 0.02,
488
+ 0.05,
489
+ 0.1,
490
+ 0.2,
491
+ 0.5,
492
+ 1,
493
+ 2,
494
+ 5,
495
+ 10,
496
+ 15,
497
+ 30,
498
+ 60,
499
+ 90,
500
+ 120,
501
+ 240,
502
+ ]
503
+ self.grammar_tree_traversal_time_avg = Histogram(
504
+ name="sglang:grammar_tree_traversal_time_avg",
505
+ documentation="Histogram of average grammar tree traversal time in seconds.",
506
+ labelnames=labels.keys(),
507
+ buckets=tree_traversal_time_buckets,
508
+ )
509
+ self.grammar_tree_traversal_time_max = Histogram(
510
+ name="sglang:grammar_tree_traversal_time_max",
511
+ documentation="Histogram of max grammar tree traversal time in seconds.",
512
+ labelnames=labels.keys(),
513
+ buckets=tree_traversal_time_buckets,
514
+ )
270
515
 
271
516
  def _log_gauge(self, gauge, data: Union[int, float]) -> None:
272
517
  # Convenience function for logging to gauge.
273
518
  gauge.labels(**self.labels).set(data)
274
519
 
520
+ def log_histogram(self, histogram, data: Union[int, float]) -> None:
521
+ histogram.labels(**self.labels).observe(data)
522
+
275
523
  def increment_bootstrap_failed_reqs(self) -> None:
276
524
  self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
277
525
 
@@ -282,14 +530,20 @@ class SchedulerMetricsCollector:
282
530
  self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
283
531
  self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
284
532
  self._log_gauge(self.token_usage, stats.token_usage)
533
+ self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
285
534
  self._log_gauge(self.gen_throughput, stats.gen_throughput)
286
535
  self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
287
536
  self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
537
+ self._log_gauge(
538
+ self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
539
+ )
288
540
  self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
541
+ self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
542
+
543
+ # Speculative decoding
289
544
  self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
290
- self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
291
545
 
292
- # Disaggregation metrics
546
+ # PD disaggregation
293
547
  self._log_gauge(
294
548
  self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
295
549
  )
@@ -302,14 +556,59 @@ class SchedulerMetricsCollector:
302
556
  self._log_gauge(
303
557
  self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
304
558
  )
559
+ self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
560
+ self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
561
+
562
+ # Retract
563
+ self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
564
+ self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
565
+ self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
566
+
567
+ # Utilization
568
+ self._log_gauge(self.utilization, stats.utilization)
569
+ if stats.max_running_requests_under_SLO is not None:
570
+ self._log_gauge(
571
+ self.max_running_requests_under_SLO,
572
+ stats.max_running_requests_under_SLO,
573
+ )
574
+
575
+ # Engine startup time
576
+ self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
577
+ if stats.engine_load_weights_time is not None:
578
+ self._log_gauge(
579
+ self.engine_load_weights_time, stats.engine_load_weights_time
580
+ )
305
581
 
306
582
  self.last_log_time = time.perf_counter()
307
583
 
584
+ def log_grammar_stats(self, grammar_stats) -> None:
585
+ # Duck-typed GrammarStats to avoid cross-package dependency
586
+ if getattr(grammar_stats, "compilation_time", None) is not None:
587
+ self.log_histogram(
588
+ self.grammar_compilation_time, grammar_stats.compilation_time
589
+ )
590
+ if getattr(grammar_stats, "schema_count", None) is not None:
591
+ self.log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
592
+ if getattr(grammar_stats, "ebnf_size", None) is not None:
593
+ self.log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
594
+ tree_times = getattr(grammar_stats, "tree_traversal_time", None)
595
+ if tree_times:
596
+ max_time = max(tree_times)
597
+ avg_time = sum(tree_times) / len(tree_times)
598
+ self.log_histogram(self.grammar_tree_traversal_time_max, max_time)
599
+ self.log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
600
+ if getattr(grammar_stats, "is_cache_hit", False):
601
+ self.num_grammar_cache_hit.labels(**self.labels).inc(1)
602
+ if getattr(grammar_stats, "is_grammar_aborted", False):
603
+ self.num_grammar_aborted.labels(**self.labels).inc(1)
604
+ self.num_grammar_total.labels(**self.labels).inc(1)
605
+
308
606
 
309
607
  class TokenizerMetricsCollector:
310
608
  def __init__(
311
609
  self,
312
- labels: Dict[str, str],
610
+ server_args: Optional[ServerArgs] = None,
611
+ labels: Dict[str, str] = None,
313
612
  bucket_time_to_first_token: Optional[List[float]] = None,
314
613
  bucket_inter_token_latency: Optional[List[float]] = None,
315
614
  bucket_e2e_request_latency: Optional[List[float]] = None,
@@ -318,7 +617,7 @@ class TokenizerMetricsCollector:
318
617
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
319
618
  from prometheus_client import Counter, Histogram
320
619
 
321
- self.labels = labels
620
+ self.labels = labels or {}
322
621
  self.collect_tokens_histogram = collect_tokens_histogram
323
622
 
324
623
  self.prompt_tokens_total = Counter(
@@ -334,7 +633,7 @@ class TokenizerMetricsCollector:
334
633
  )
335
634
 
336
635
  if collect_tokens_histogram:
337
- bucket_prompt_tokens = [
636
+ default_bucket_prompt_tokens = [
338
637
  100,
339
638
  300,
340
639
  500,
@@ -358,39 +657,30 @@ class TokenizerMetricsCollector:
358
657
  30000,
359
658
  35000,
360
659
  40000,
660
+ 66000,
661
+ 99000,
662
+ 132000,
663
+ 300000,
664
+ 600000,
665
+ 900000,
666
+ 1100000,
361
667
  ]
362
668
  self.prompt_tokens_histogram = Histogram(
363
669
  name="sglang:prompt_tokens_histogram",
364
670
  documentation="Histogram of prompt token length.",
365
671
  labelnames=labels.keys(),
366
- buckets=bucket_prompt_tokens,
672
+ buckets=generate_buckets(
673
+ server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
674
+ ),
367
675
  )
368
- bucket_generation_tokens = [
369
- 100,
370
- 300,
371
- 500,
372
- 1000,
373
- 1200,
374
- 1500,
375
- 1700,
376
- 2000,
377
- 2500,
378
- 3000,
379
- 3500,
380
- 4000,
381
- 4500,
382
- 5000,
383
- 6000,
384
- 7000,
385
- 8000,
386
- 9000,
387
- 10000,
388
- ]
389
676
  self.generation_tokens_histogram = Histogram(
390
677
  name="sglang:generation_tokens_histogram",
391
678
  documentation="Histogram of generation token length.",
392
679
  labelnames=labels.keys(),
393
- buckets=bucket_generation_tokens,
680
+ buckets=generate_buckets(
681
+ server_args.generation_tokens_buckets,
682
+ default_bucket_prompt_tokens,
683
+ ),
394
684
  )
395
685
 
396
686
  self.cached_tokens_total = Counter(
@@ -459,7 +749,10 @@ class TokenizerMetricsCollector:
459
749
  100,
460
750
  200,
461
751
  400,
462
- 800,
752
+ 600,
753
+ 1200,
754
+ 1800,
755
+ 2400,
463
756
  ]
464
757
 
465
758
  if bucket_inter_token_latency is None:
@@ -510,38 +803,68 @@ class TokenizerMetricsCollector:
510
803
  buckets=bucket_e2e_request_latency,
511
804
  )
512
805
 
513
- def _log_histogram(self, histogram, data: Union[int, float]) -> None:
514
- histogram.labels(**self.labels).observe(data)
806
+ # Offline batch specific TTFB histogram
807
+ self.histogram_time_to_first_token_offline_batch = Histogram(
808
+ name="sglang:time_to_first_token_seconds_offline_batch",
809
+ documentation="Histogram of time to first token in seconds for offline batch requests.",
810
+ labelnames=labels.keys(),
811
+ buckets=bucket_time_to_first_token,
812
+ )
515
813
 
516
814
  def observe_one_finished_request(
517
815
  self,
816
+ labels: Dict[str, str],
518
817
  prompt_tokens: int,
519
818
  generation_tokens: int,
520
819
  cached_tokens: int,
521
820
  e2e_latency: float,
522
821
  has_grammar: bool,
523
822
  ):
524
- self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
525
- self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
823
+ self.prompt_tokens_total.labels(**labels).inc(prompt_tokens)
824
+ self.generation_tokens_total.labels(**labels).inc(generation_tokens)
526
825
  if cached_tokens > 0:
527
- self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
528
- self.num_requests_total.labels(**self.labels).inc(1)
826
+ self.cached_tokens_total.labels(**labels).inc(cached_tokens)
827
+ self.num_requests_total.labels(**labels).inc(1)
529
828
  if has_grammar:
530
- self.num_so_requests_total.labels(**self.labels).inc(1)
531
- self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
829
+ self.num_so_requests_total.labels(**labels).inc(1)
830
+ self.histogram_e2e_request_latency.labels(**labels).observe(float(e2e_latency))
532
831
  if self.collect_tokens_histogram:
533
- self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
534
- self._log_histogram(self.generation_tokens_histogram, generation_tokens)
535
-
536
- def observe_time_to_first_token(self, value: float):
537
- self.histogram_time_to_first_token.labels(**self.labels).observe(value)
832
+ self.prompt_tokens_histogram.labels(**labels).observe(float(prompt_tokens))
833
+ self.generation_tokens_histogram.labels(**labels).observe(
834
+ float(generation_tokens)
835
+ )
538
836
 
539
- def observe_inter_token_latency(self, internval: float, num_new_tokens: int):
837
+ def observe_time_to_first_token(
838
+ self, labels: Dict[str, str], value: float, type: str = ""
839
+ ):
840
+ if type == "batch":
841
+ self.histogram_time_to_first_token_offline_batch.labels(**labels).observe(
842
+ value
843
+ )
844
+ else:
845
+ self.histogram_time_to_first_token.labels(**labels).observe(value)
846
+
847
+ def check_time_to_first_token_straggler(self, value: float) -> bool:
848
+ his = self.histogram_time_to_first_token.labels(**self.labels)
849
+ total_observations = sum(bucket._value for bucket in his._buckets)
850
+ if total_observations < 100:
851
+ return False
852
+ p99_threshold = total_observations * 0.99
853
+ cumulative_count = 0
854
+ for i, bucket in enumerate(his._buckets):
855
+ cumulative_count += bucket._value
856
+ if cumulative_count > p99_threshold:
857
+ return value >= his._upper_bounds[i]
858
+ return False
859
+
860
+ def observe_inter_token_latency(
861
+ self, labels: Dict[str, str], internval: float, num_new_tokens: int
862
+ ):
540
863
  adjusted_interval = internval / num_new_tokens
541
864
 
542
865
  # A faster version of the Histogram::observe which observes multiple values at the same time.
543
866
  # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
544
- his = self.histogram_inter_token_latency_seconds.labels(**self.labels)
867
+ his = self.histogram_inter_token_latency_seconds.labels(**labels)
545
868
  his._sum.inc(internval)
546
869
 
547
870
  for i, bound in enumerate(his._upper_bounds):
@@ -551,3 +874,105 @@ class TokenizerMetricsCollector:
551
874
 
552
875
  def observe_one_aborted_request(self):
553
876
  self.num_aborted_requests_total.labels(**self.labels).inc(1)
877
+
878
+
879
+ @dataclass
880
+ class StorageMetrics:
881
+ prefetch_pgs: List[int] = field(default_factory=list)
882
+ backup_pgs: List[int] = field(default_factory=list)
883
+ prefetch_bandwidth: List[float] = field(default_factory=list)
884
+ backup_bandwidth: List[float] = field(default_factory=list)
885
+
886
+
887
+ class StorageMetricsCollector:
888
+ def __init__(
889
+ self,
890
+ labels: Dict[str, str],
891
+ ):
892
+ from prometheus_client import Counter, Histogram
893
+
894
+ self.labels = labels
895
+
896
+ self.prefetched_tokens_total = Counter(
897
+ name="sglang:prefetched_tokens_total",
898
+ documentation="Number of prefetched prompt tokens.",
899
+ labelnames=labels.keys(),
900
+ )
901
+
902
+ self.backuped_tokens_total = Counter(
903
+ name="sglang:backuped_tokens_total",
904
+ documentation="Number of backuped tokens.",
905
+ labelnames=labels.keys(),
906
+ )
907
+
908
+ bucket_io = [
909
+ 1,
910
+ 5,
911
+ 10,
912
+ 50,
913
+ 100,
914
+ ]
915
+
916
+ bucket_bandwidth = [
917
+ 0.1,
918
+ 0.5,
919
+ 1,
920
+ 5,
921
+ 10,
922
+ 50,
923
+ 100,
924
+ ]
925
+
926
+ self.histogram_prefetch_pgs = Histogram(
927
+ name="sglang:prefetch_pgs",
928
+ documentation="Histogram of prefetch pages of batches.",
929
+ labelnames=labels.keys(),
930
+ buckets=bucket_io,
931
+ )
932
+
933
+ self.histogram_backup_pgs = Histogram(
934
+ name="sglang:backup_pgs",
935
+ documentation="Histogram of backup pages of batches.",
936
+ labelnames=labels.keys(),
937
+ buckets=bucket_io,
938
+ )
939
+
940
+ self.histogram_prefetch_bandwidth = Histogram(
941
+ name="sglang:prefetch_bandwidth",
942
+ documentation="Histogram of prefetch bandwidth in GB/s.",
943
+ labelnames=labels.keys(),
944
+ buckets=bucket_bandwidth,
945
+ )
946
+
947
+ self.histogram_backup_bandwidth = Histogram(
948
+ name="sglang:backup_bandwidth",
949
+ documentation="Histogram of backup bandwidth in GB/s.",
950
+ labelnames=labels.keys(),
951
+ buckets=bucket_bandwidth,
952
+ )
953
+
954
+ def log_prefetched_tokens(self, prefetched_tokens: int):
955
+ if prefetched_tokens > 0:
956
+ self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)
957
+
958
+ def log_backuped_tokens(self, backuped_tokens: int):
959
+ if backuped_tokens > 0:
960
+ self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)
961
+
962
+ def _log_histogram(self, histogram, data: Union[int, float]):
963
+ histogram.labels(**self.labels).observe(data)
964
+
965
+ def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
966
+ if storage_metrics is None:
967
+ return
968
+
969
+ assert isinstance(storage_metrics, StorageMetrics)
970
+
971
+ for v in storage_metrics.prefetch_pgs:
972
+ self._log_histogram(self.histogram_prefetch_pgs, v)
973
+ for v in storage_metrics.backup_pgs:
974
+ self._log_histogram(self.histogram_backup_pgs, v)
975
+ for v in storage_metrics.prefetch_bandwidth:
976
+ self._log_histogram(self.histogram_prefetch_bandwidth, v)
977
+ for v in storage_metrics.backup_bandwidth:
978
+ self._log_histogram(self.histogram_backup_bandwidth, v)